# flake8: noqa: E501
import subprocess
import os
import tempfile
import shutil
import argparse
import chardet
import math


def is_ascii(file_path):
    """
    Checks if a file contains only ASCII characters.

    Args:
        file_path (str): The path to the file.

    Returns:
        bool: True if the file contains only ASCII characters, False otherwise.
        None: If the file does not exist.
    """
    if not os.path.exists(file_path):
        return None  # Indicate file not found.
    try:
        with open(file_path, "r", encoding="ascii") as f:
            f.read()  # Attempt to read the entire file as ASCII
        return True
    except UnicodeDecodeError:
        return False


def has_high_entropy(file_path, threshold=6.0):
    """
    Checks if a file has high entropy, which might indicate it is not text.

    Args:
        file_path (str): The path to the file.
        threshold (float): Entropy threshold in bits per byte above which the file
            is considered high entropy. Byte entropy ranges from 0 to 8 bits; plain
            text usually stays well below this heuristic cutoff, while compressed
            or encrypted data approaches 8.

    Returns:
        bool: True if entropy is above the threshold, False otherwise.
        None: If the file does not exist.
    """
    if not os.path.exists(file_path):
        return None
    try:
        with open(file_path, "rb") as f:  # Important: read as binary
            data = f.read()
    except IOError:
        return True  # Treat as non-text if there is an I/O error
    if not data:
        return False  # Empty files are considered text
    entropy = calculate_entropy(data)
    return entropy > threshold


def calculate_entropy(data):
    """
    Calculates the Shannon entropy (in bits per byte) of a byte string.

    Args:
        data (bytes): The byte string.

    Returns:
        float: The entropy.
    """
    if not data:
        return 0.0  # Avoid log(0)
    entropy = 0.0
    data_length = len(data)
    counts = [0] * 256  # Frequency of each possible byte value
    for byte in data:
        counts[byte] += 1
    for count in counts:
        if count > 0:
            probability = count / data_length
            entropy -= probability * math.log(probability, 2)
    return entropy


def check_chardet_encoding(file_path, confidence_threshold=0.8):
    """
    Checks the file encoding using the chardet library.

    Args:
        file_path (str): The path to the file.
        confidence_threshold (float): The minimum confidence level for encoding detection.

    Returns:
        bool: True if the encoding is detected with high confidence and is a text encoding, False otherwise.
        None: If the file does not exist.
    """
    if not os.path.exists(file_path):
        return None
    try:
        with open(file_path, "rb") as f:  # Important: read as binary
            data = f.read()
    except IOError:
        return False  # If the file can't be opened, assume it's not a simple text file.
    if not data:
        return True  # Empty files are usually considered text
    result = chardet.detect(data)
    encoding = result["encoding"]
    confidence = result["confidence"]
    # chardet reports None for the encoding when it cannot detect one (e.g. binary data)
    return bool(encoding) and confidence > confidence_threshold


def is_text_file(file_path, aggressive=False):
    """
    Wrapper function to check if a file is a text file using multiple methods.

    Args:
        file_path (str): The path to the file.
        aggressive (bool, optional): If True, requires all checks to pass for
            stricter verification. If False, returns True if any check passes.
            Defaults to False.

    Returns:
        bool: True if the file is a text file, False otherwise.
        None: If the file does not exist.
""" if not os.path.exists(file_path): return None # Basic checks ascii_check = is_ascii(file_path) if ascii_check is None: return None # File not found if aggressive: # Run all checks and require them all to pass high_entropy_check = not has_high_entropy( file_path ) # Invert because we want to know if it DOESN'T have high entropy chardet_check = check_chardet_encoding(file_path) return ascii_check and high_entropy_check and chardet_check else: # Run checks and return True if any of them pass high_entropy_check = not has_high_entropy(file_path) chardet_check = check_chardet_encoding(file_path) return ascii_check or high_entropy_check or chardet_check def get_latest_text_files_to_stdout(remote_repo_url=None, ignored_files=None): """ Checks out the latest commit from a remote Git repository or the current working directory (if no URL is provided) to a temporary folder, and then prints the contents of all files identified as text files to stdout, prepended by their relative paths from the repository root, excluding specified ignored files. Args: remote_repo_url: The URL of the remote Git repository (optional). If None, the current working directory is assumed to be a Git repo. ignored_files: A list of files or directories to ignore (relative to the repo root). """ temp_dir = None if ignored_files is None: ignored_files = [] # Ensure .git and .gitignore are always ignored ignored_files.extend([".git", ".gitignore"]) ignored_files = list(set(ignored_files)) # remove duplicates try: # Create a temporary directory temp_dir = tempfile.mkdtemp() # Clone the repository, but only the latest commit (shallow clone) clone_command = ["git", "clone", "--depth", "1"] if remote_repo_url: clone_command.extend([remote_repo_url, temp_dir]) else: # Check if the current directory is a Git repository. try: subprocess.run( ["git", "rev-parse", "--is-inside-work-tree"], check=True, capture_output=True, text=True, cwd=os.getcwd(), ) # run in current directory except subprocess.CalledProcessError: raise ValueError( "No Git repository URL provided and current directory is not a Git repository." ) clone_command.extend([os.getcwd(), temp_dir]) # clone current dir to temp subprocess.run(clone_command, check=True, capture_output=True, text=True) # Find all files and filter for text files text_files = [] for root, _, files in os.walk(temp_dir): for file in files: file_path = os.path.join(root, file) relative_path = os.path.relpath(file_path, temp_dir) # Check if the file or any of its parent directories are ignored ignore = False path_components = relative_path.split( os.sep ) # split based on OS-specific path separator current_path = "" for component in path_components: current_path = ( os.path.join(current_path, component) if current_path else component ) # prevent empty first join if current_path in ignored_files: ignore = True break if not ignore: if is_text_file(file_path): # Use the is_text_file function text_files.append(file_path) # Print the contents of each text file, prepended by its relative path for file_path in text_files: relative_path = os.path.relpath(file_path, temp_dir) print(f"--- {relative_path} ---") try: with open(file_path, "r", encoding="utf-8") as f: # Use UTF-8 encoding print(f.read()) except UnicodeDecodeError: print( f"Error: Could not decode file {relative_path} using UTF-8. Skipping file contents." 
                )
            print()  # Blank line between files
    except subprocess.CalledProcessError as e:
        print(f"Error executing Git command: {e.stderr}")
    except ValueError as e:
        print(e)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Clean up the temporary directory
        if temp_dir:
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Checkout and print text files from a remote Git repository."
    )
    parser.add_argument(
        "-r",
        "--repo",
        required=False,
        help="The URL of the remote Git repository. If not provided, the current directory is used if it's a Git repository.",
    )
    parser.add_argument(
        "-i",
        "--ignored-files",
        nargs="+",
        default=[],
        help="Files or directories to ignore (space-separated).",
    )
    args = parser.parse_args()
    remote_repository_url = args.repo
    ignored_files = args.ignored_files

    # Warn about the URL only if one was provided
    if remote_repository_url:
        if (
            "github.com" not in remote_repository_url
            and "gitlab.com" not in remote_repository_url
            and "bitbucket.org" not in remote_repository_url
        ):
            print(
                "Warning: This script is designed for common public repository hosting providers. Ensure the Git URL is correct."
            )

    get_latest_text_files_to_stdout(remote_repository_url, ignored_files)
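
# Example invocations. The script filename, repository URL, and ignore names
# below are placeholders for illustration only, not values defined by this project:
#
#   python print_text_files.py --repo https://github.com/user/project.git
#   python print_text_files.py -i build dist __pycache__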