# flake8: noqa: E501 import subprocess import os import tempfile import shutil import argparse import magic import chardet import math def is_ascii(file_path): """ Checks if a file contains only ASCII characters. Args: file_path (str): The path to the file. Returns: bool: True if the file contains only ASCII characters, False otherwise. None: If the file does not exist. """ if not os.path.exists(file_path): return None # Indicate file not found. try: with open(file_path, "r", encoding="ascii") as f: f.read() # Attempt to read the entire file as ASCII return True except UnicodeDecodeError: return False def has_high_entropy(file_path, threshold=0.7): """ Checks if a file has high entropy, which might indicate it's not text. Args: file_path (str): The path to the file. threshold (float): Entropy threshold above which it's considered high entropy. Returns: bool: True if entropy is above the threshold, False otherwise. None: If the file does not exist. """ if not os.path.exists(file_path): return None try: with open(file_path, "rb") as f: # Important: Read as binary data = f.read() except IOError: return True # Treat as non-text if there is an I/O error if not data: return False # empty files considered text entropy = calculate_entropy(data) return entropy > threshold def calculate_entropy(data): """ Calculates the entropy of a byte string. Args: data (bytes): The byte string. Returns: float: The entropy. """ if not data: return 0.0 # Avoid log(0) entropy = 0 data_length = len(data) seen_bytes = bytearray(range(256)) # All possible byte values counts = [0] * 256 for byte in data: counts[byte] += 1 for byte in seen_bytes: probability = float(counts[byte]) / data_length if probability > 0: entropy -= probability * math.log(probability, 2) return entropy def check_chardet_encoding(file_path, confidence_threshold=0.8): """ Checks the file encoding using chardet library. Args: file_path (str): The path to the file. confidence_threshold (float): The minimum confidence level for encoding detection. Returns: bool: True if the encoding is detected with high confidence and is a text encoding, False otherwise. None: If the file does not exist. """ if not os.path.exists(file_path): return None try: with open(file_path, "rb") as f: # Important: Read as binary data = f.read() except IOError: return False # If file can't be opened, assume it's not a simple text file. if not data: return True # Empty files are usually considered text result = chardet.detect(data) encoding = result["encoding"] confidence = result["confidence"] if encoding and confidence > confidence_threshold: # Check if it's a recognized text encoding (not binary or None) if encoding != "binary" and encoding is not None: return True return False def is_text_file(file_path, aggressive=False): """ Wrapper function to check if a file is a text file using multiple methods. Args: file_path (str): The path to the file. aggressive (bool, optional): If True, combines all checks for stricter verification. If False, returns True if any check passes. Defaults to False. Returns: bool: True if the file is a text file, False otherwise. None: If the file does not exist. """ if not os.path.exists(file_path): return None # Basic checks ascii_check = is_ascii(file_path) if ascii_check is None: return None # File not found if aggressive: # Run all checks and require them all to pass high_entropy_check = not has_high_entropy( file_path ) # Invert because we want to know if it DOESN'T have high entropy chardet_check = check_chardet_encoding(file_path) return ascii_check and high_entropy_check and chardet_check else: # Run checks and return True if any of them pass high_entropy_check = not has_high_entropy(file_path) chardet_check = check_chardet_encoding(file_path) return ascii_check or high_entropy_check or chardet_check def get_latest_text_files_to_stdout(remote_repo_url=None, ignored_files=None): """ Checks out the latest commit from a remote Git repository or the current working directory (if no URL is provided) to a temporary folder, and then prints the contents of all files identified as text files to stdout, prepended by their relative paths from the repository root, excluding specified ignored files. Supports "!" to specify includes only. Args: remote_repo_url: The URL of the remote Git repository (optional). If None, the current working directory is assumed to be a Git repo. ignored_files: A list of files or directories to ignore (relative to the repo root). If a list contains a value starting with "!", it means "include only". """ temp_dir = None if ignored_files is None: ignored_files = [] # Ensure .git and .gitignore are always ignored (unless include only is specified) include_only = any(item.startswith("!") for item in ignored_files) if not include_only: ignored_files.extend([".git", ".gitignore"]) ignored_files = list(set(ignored_files)) # remove duplicates # Determine if "include only" is active and extract the include paths include_only = any(item.startswith("!") for item in ignored_files) include_paths = [item[1:] for item in ignored_files if item.startswith("!")] ignore_paths = [item for item in ignored_files if not item.startswith("!")] try: # Create a temporary directory temp_dir = tempfile.mkdtemp() # Clone the repository, but only the latest commit (shallow clone) clone_command = ["git", "clone", "--depth", "1"] if remote_repo_url: clone_command.extend([remote_repo_url, temp_dir]) else: # Check if the current directory is a Git repository. try: subprocess.run( ["git", "rev-parse", "--is-inside-work-tree"], check=True, capture_output=True, text=True, cwd=os.getcwd(), ) # run in current directory except subprocess.CalledProcessError: raise ValueError( "No Git repository URL provided and current directory is not a Git repository." ) clone_command.extend([os.getcwd(), temp_dir]) # clone current dir to temp subprocess.run(clone_command, check=True, capture_output=True, text=True) # Find all files and filter for text files text_files = [] for root, _, files in os.walk(temp_dir): for file in files: file_path = os.path.join(root, file) relative_path = os.path.relpath(file_path, temp_dir) if include_only: # Include only logic include = False for include_path in include_paths: if relative_path.startswith(include_path): include = True break if not include: continue # Skip if not in include paths else: # Ignore logic (standard ignore) ignore = False path_components = relative_path.split( os.sep ) # split based on OS-specific path separator current_path = "" for component in path_components: current_path = ( os.path.join(current_path, component) if current_path else component ) # prevent empty first join if current_path in ignore_paths: ignore = True break if ignore: continue if is_text_file(file_path): # Use the is_text_file function text_files.append(file_path) # Print the contents of each text file, prepended by its relative path for file_path in text_files: relative_path = os.path.relpath(file_path, temp_dir) print(f"--- {relative_path} ---") try: with open(file_path, "r", encoding="utf-8") as f: # Use UTF-8 encoding print(f.read()) except UnicodeDecodeError: print( f"Error: Could not decode file {relative_path} using UTF-8. Skipping file contents." ) # handle binary or other non-UTF-8 encodings print() # Add a blank line between files except subprocess.CalledProcessError as e: print(f"Error executing Git command: {e.stderr}") except ValueError as e: print(e) except Exception as e: print(f"An error occurred: {e}") finally: # Clean up the temporary directory if temp_dir: shutil.rmtree(temp_dir) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Checkout and print text files from a remote Git repository." ) parser.add_argument( "-r", "--repo", required=False, help="The URL of the remote Git repository. If not provided, the current directory is used if it's a Git repository.", ) parser.add_argument( "-i", "--ignored-files", nargs="+", default=[], help="Files or directories to ignore (space-separated). Use ! to specify include only.", ) args = parser.parse_args() remote_repository_url = args.repo ignored_files = args.ignored_files # Verify the URL only if it's provided if remote_repository_url: if ( "github.com" not in remote_repository_url and "gitlab.com" not in remote_repository_url and "bitbucket.org" not in remote_repository_url ): print( "Warning: This script is designed for common public repository hosting providers. Ensure the Git URL is correct." ) get_latest_text_files_to_stdout(remote_repository_url, ignored_files)