From aeb847aa47dad1496c087ab07b21938353c836f7 Mon Sep 17 00:00:00 2001
From: Mohammad Rafiq
Date: Thu, 6 Mar 2025 18:05:48 +0800
Subject: [PATCH] feat: add git-extract script to home

---
 users/rafiq/default.nix             |   3 +-
 users/rafiq/scripts/default.nix     |   5 +
 users/rafiq/scripts/git-extract.nix |  12 ++
 users/rafiq/scripts/git-extract.py  | 276 ++++++++++++++++++++++++++++
 4 files changed, 295 insertions(+), 1 deletion(-)
 create mode 100644 users/rafiq/scripts/default.nix
 create mode 100644 users/rafiq/scripts/git-extract.nix
 create mode 100644 users/rafiq/scripts/git-extract.py

diff --git a/users/rafiq/default.nix b/users/rafiq/default.nix
index 07f4da0..8f102cc 100644
--- a/users/rafiq/default.nix
+++ b/users/rafiq/default.nix
@@ -1,4 +1,4 @@
-_: {
+{ pkgs, ... }: {
   imports = [
     ./modules/git.nix # git specific configs
     ./modules/sh.nix # bash and other shell specific configs
@@ -8,6 +8,7 @@ _: {
     ./modules/hyprland.nix # hyprland settings
     ./modules/fonts.nix # font settings
     ./modules/utils.nix # miscellaneous utilities
+    ./scripts
   ];
 
   # This enables using home-manager from the command line.
diff --git a/users/rafiq/scripts/default.nix b/users/rafiq/scripts/default.nix
new file mode 100644
index 0000000..16c1514
--- /dev/null
+++ b/users/rafiq/scripts/default.nix
@@ -0,0 +1,5 @@
+{
+  imports = [
+    ./git-extract.nix
+  ];
+}
diff --git a/users/rafiq/scripts/git-extract.nix b/users/rafiq/scripts/git-extract.nix
new file mode 100644
index 0000000..27fb4bf
--- /dev/null
+++ b/users/rafiq/scripts/git-extract.nix
@@ -0,0 +1,12 @@
+{ pkgs, ... }: {
+  home.packages = [
+    (pkgs.writers.writePython3Bin "git-extract" {
+
+      libraries = with pkgs.python3Packages; [
+        magic
+        chardet
+      ];
+
+    } (builtins.readFile ./git-extract.py))
+  ];
+}
diff --git a/users/rafiq/scripts/git-extract.py b/users/rafiq/scripts/git-extract.py
new file mode 100644
index 0000000..7c769de
--- /dev/null
+++ b/users/rafiq/scripts/git-extract.py
@@ -0,0 +1,276 @@
+# flake8: noqa: E501
+import subprocess
+import os
+import tempfile
+import shutil
+import argparse
+import magic
+import chardet
+import math
+
+
+def is_ascii(file_path):
+    """
+    Checks if a file contains only ASCII characters.
+
+    Args:
+        file_path (str): The path to the file.
+
+    Returns:
+        bool: True if the file contains only ASCII characters, False otherwise.
+        None: If the file does not exist.
+    """
+    if not os.path.exists(file_path):
+        return None  # Indicate file not found.
+
+    try:
+        with open(file_path, "r", encoding="ascii") as f:
+            f.read()  # Attempt to read the entire file as ASCII
+        return True
+    except UnicodeDecodeError:
+        return False
+
+
+def has_high_entropy(file_path, threshold=0.7):
+    """
+    Checks if a file has high entropy, which might indicate it's not text.
+
+    Args:
+        file_path (str): The path to the file.
+        threshold (float): Entropy threshold above which it's considered high entropy.
+
+    Returns:
+        bool: True if entropy is above the threshold, False otherwise.
+        None: If the file does not exist.
+    """
+    if not os.path.exists(file_path):
+        return None
+
+    try:
+        with open(file_path, "rb") as f:  # Important: Read as binary
+            data = f.read()
+    except IOError:
+        return True  # Treat as non-text if there is an I/O error
+
+    if not data:
+        return False  # empty files considered text
+
+    entropy = calculate_entropy(data)
+    return entropy > threshold
+
+
+def calculate_entropy(data):
+    """
+    Calculates the entropy of a byte string.
+
+    Args:
+        data (bytes): The byte string.
+
+    Returns:
+        float: The entropy.
+    """
+    if not data:
+        return 0.0  # Avoid log(0)
+
+    entropy = 0
+    data_length = len(data)
+    seen_bytes = bytearray(range(256))  # All possible byte values
+    counts = [0] * 256
+
+    for byte in data:
+        counts[byte] += 1
+
+    for byte in seen_bytes:
+        probability = float(counts[byte]) / data_length
+        if probability > 0:
+            entropy -= probability * math.log(probability, 2)
+
+    return entropy
+
+
+def check_chardet_encoding(file_path, confidence_threshold=0.8):
+    """
+    Checks the file encoding using chardet library.
+
+    Args:
+        file_path (str): The path to the file.
+        confidence_threshold (float): The minimum confidence level for encoding detection.
+
+    Returns:
+        bool: True if the encoding is detected with high confidence and is a text encoding, False otherwise.
+        None: If the file does not exist.
+    """
+    if not os.path.exists(file_path):
+        return None
+
+    try:
+        with open(file_path, "rb") as f:  # Important: Read as binary
+            data = f.read()
+    except IOError:
+        return False  # If file can't be opened, assume it's not a simple text file.
+
+    if not data:
+        return True  # Empty files are usually considered text
+
+    result = chardet.detect(data)
+    encoding = result["encoding"]
+    confidence = result["confidence"]
+
+    if encoding and confidence > confidence_threshold:
+        # Check if it's a recognized text encoding (not binary or None)
+        if encoding != "binary" and encoding is not None:
+            return True
+    return False
+
+
+def is_text_file(file_path, aggressive=False):
+    """
+    Wrapper function to check if a file is a text file using multiple methods.
+
+    Args:
+        file_path (str): The path to the file.
+        aggressive (bool, optional): If True, combines all checks for stricter verification.
+            If False, returns True if any check passes. Defaults to False.
+
+    Returns:
+        bool: True if the file is a text file, False otherwise.
+        None: If the file does not exist.
+    """
+
+    if not os.path.exists(file_path):
+        return None
+
+    # Basic checks
+    ascii_check = is_ascii(file_path)
+    if ascii_check is None:
+        return None  # File not found
+
+    if aggressive:
+        # Run all checks and require them all to pass
+        high_entropy_check = not has_high_entropy(
+            file_path
+        )  # Invert because we want to know if it DOESN'T have high entropy
+        chardet_check = check_chardet_encoding(file_path)
+
+        return ascii_check and high_entropy_check and chardet_check
+    else:
+        # Run checks and return True if any of them pass
+        high_entropy_check = not has_high_entropy(file_path)
+        chardet_check = check_chardet_encoding(file_path)
+        return ascii_check or high_entropy_check or chardet_check
+
+
+def get_latest_text_files_to_stdout(remote_repo_url, ignored_files=None):
+    """
+    Checks out the latest commit from a remote Git repository to a temporary folder,
+    and then prints the contents of all files identified as text files to stdout,
+    prepended by their relative paths from the repository root, excluding specified
+    ignored files.
+
+    Args:
+        remote_repo_url: The URL of the remote Git repository.
+        ignored_files: A list of files or directories to ignore (relative to the repo root).
+    """
+
+    temp_dir = None
+
+    # Always skip .git and .gitignore. Build a fresh set instead of extending
+    # the caller's list in place, so the argument is never mutated and the
+    # ignore lookups below are O(1).
+    always_ignored = {".git", ".gitignore"}
+    ignored_files = always_ignored.union(ignored_files or ())
+
+    try:
+        # Create a temporary directory
+        temp_dir = tempfile.mkdtemp()
+
+        # Clone the repository, but only the latest commit (shallow clone)
+        subprocess.run(
+            ["git", "clone", "--depth", "1", remote_repo_url, temp_dir],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+
+        # Find all files and filter for text files
+        text_files = []
+        for root, _, files in os.walk(temp_dir):
+            for file in files:
+                file_path = os.path.join(root, file)
+                relative_path = os.path.relpath(file_path, temp_dir)
+
+                # Check if the file or any of its parent directories are ignored
+                ignore = False
+                path_components = relative_path.split(
+                    os.sep
+                )  # split based on OS-specific path separator
+                current_path = ""
+                for component in path_components:
+                    current_path = (
+                        os.path.join(current_path, component)
+                        if current_path
+                        else component
+                    )  # prevent empty first join
+                    if current_path in ignored_files:
+                        ignore = True
+                        break
+
+                if not ignore:
+                    if is_text_file(file_path):  # Use the is_text_file function
+                        text_files.append(file_path)
+
+        # Print the contents of each text file, prepended by its relative path
+        for file_path in text_files:
+            relative_path = os.path.relpath(file_path, temp_dir)
+            print(f"--- {relative_path} ---")
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:  # Use UTF-8 encoding
+                    print(f.read())
+            except UnicodeDecodeError:
+                print(
+                    f"Error: Could not decode file {relative_path} using UTF-8. Skipping file contents."
+                )  # handle binary or other non-UTF-8 encodings
+            print()  # Add a blank line between files
+
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing Git command: {e.stderr}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+    finally:
+        # Clean up the temporary directory
+        if temp_dir:
+            shutil.rmtree(temp_dir)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Checkout and print text files from a remote Git repository."
+    )
+    parser.add_argument(
+        "-r", "--repo", required=True, help="The URL of the remote Git repository."
+    )
+    parser.add_argument(
+        "-i",
+        "--ignored-files",
+        nargs="+",
+        default=[],
+        help="Files or directories to ignore (space-separated).",
+    )
+
+    args = parser.parse_args()
+
+    remote_repository_url = args.repo
+    ignored_files = args.ignored_files
+
+    # Verify the URL
+    if (
+        "github.com" not in remote_repository_url
+        and "gitlab.com" not in remote_repository_url
+        and "bitbucket.org" not in remote_repository_url
+    ):
+        print(
+            "Warning: This script is designed for common public repository hosting providers. Ensure the Git URL is correct."
+        )
+
+    get_latest_text_files_to_stdout(remote_repository_url, ignored_files)
+