# pantheon/configs/scripts/git-extract.py

# flake8: noqa: E501
import subprocess
import os
import tempfile
import shutil
import argparse
import magic
import chardet
import math


def is_ascii(file_path):
    """
    Report whether a file decodes cleanly as ASCII text.

    Args:
        file_path (str): Location of the file to inspect.

    Returns:
        bool: True when the entire file decodes as ASCII, False as soon as a
            non-ASCII byte makes decoding fail.
        None: When nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        try:
            with open(file_path, "r", encoding="ascii") as handle:
                # Decoding the whole content is the test; the text is discarded.
                handle.read()
        except UnicodeDecodeError:
            return False
        return True

    # Missing path: signal "not found" rather than a yes/no answer.
    return None


def has_high_entropy(file_path, threshold=0.7):
    """
    Tell whether a file's byte entropy exceeds a threshold (a hint that it is not text).

    The file is read in binary mode and its entropy is obtained from
    ``calculate_entropy``; the comparison is a strict ``>``.

    Args:
        file_path (str): Location of the file to inspect.
        threshold (float): Entropy value above which the file counts as high entropy.

    Returns:
        bool: True if the entropy is above ``threshold`` or the file cannot be
            read; False otherwise (including for empty files).
        None: When nothing exists at ``file_path``.
    """
    # NOTE(review): calculate_entropy uses log base 2 over byte values, i.e. a
    # 0-8 bits-per-byte scale, so the 0.7 default is exceeded by almost any
    # non-trivial content - confirm whether a normalised 0-1 scale was intended.
    if not os.path.exists(file_path):
        return None

    try:
        with open(file_path, "rb") as stream:  # raw bytes, no decoding
            raw = stream.read()
    except IOError:
        # An unreadable file is treated as non-text.
        return True

    if raw:
        return calculate_entropy(raw) > threshold

    # Empty files are considered text.
    return False


def calculate_entropy(data):
    """
    Compute the Shannon entropy of a byte string, in bits per byte.

    Args:
        data (bytes): The bytes to measure.

    Returns:
        float: The entropy; 0.0 for empty input.
    """
    if not data:
        return 0.0  # Nothing to measure, and avoids dividing by zero below.

    total = len(data)

    # Frequency table indexed by byte value.
    tally = [0] * 256
    for value in data:
        tally[value] += 1

    bits = 0
    # Accumulate in ascending byte-value order; values that never occur
    # contribute nothing and are skipped so log(0) is never evaluated.
    for occurrences in tally:
        if occurrences:
            share = float(occurrences) / total
            bits -= share * math.log(share, 2)

    return bits


def check_chardet_encoding(file_path, confidence_threshold=0.8):
    """
    Use chardet to decide whether a file's bytes look like a known text encoding.

    Args:
        file_path (str): Location of the file to inspect.
        confidence_threshold (float): Confidence that the detection must
            strictly exceed for the file to be accepted.

    Returns:
        bool: True for empty files, or when chardet names an encoding other
            than "binary" with confidence above the threshold; False
            otherwise, including when the file cannot be read.
        None: When nothing exists at ``file_path``.
    """
    if not os.path.exists(file_path):
        return None

    try:
        with open(file_path, "rb") as stream:  # chardet works on raw bytes
            raw = stream.read()
    except IOError:
        # Unreadable: assume it is not a simple text file.
        return False

    if not raw:
        # Empty files are usually considered text.
        return True

    detection = chardet.detect(raw)
    detected_encoding = detection["encoding"]
    detected_confidence = detection["confidence"]

    # Confidence is only compared once an encoding was actually reported.
    return (
        bool(detected_encoding)
        and detected_confidence > confidence_threshold
        and detected_encoding != "binary"
    )


def is_text_file(file_path, aggressive=False):
    """
    Decide whether a file is text by combining the ASCII, entropy and chardet checks.

    The checks are evaluated lazily in the order ASCII -> entropy -> chardet,
    so the file is not re-read (and the comparatively slow chardet detection
    is not run) once the outcome is already determined.

    Args:
        file_path (str): The path to the file.
        aggressive (bool, optional): If True, every check must pass. If False,
            a single passing check is enough. Defaults to False.

    Returns:
        bool: True if the file is judged to be text, False otherwise.
        None: If the file does not exist.
    """
    if not os.path.exists(file_path):
        return None

    ascii_check = is_ascii(file_path)
    if ascii_check is None:
        return None  # File disappeared between the two existence checks.

    if aggressive:
        # Strict mode: stop at the first failing check.
        return (
            ascii_check
            and not has_high_entropy(file_path)  # "not high entropy" is the passing condition
            and check_chardet_encoding(file_path)
        )

    # Lenient mode: stop at the first passing check.
    return (
        ascii_check
        or not has_high_entropy(file_path)
        or check_chardet_encoding(file_path)
    )


def _is_selected_path(relative_path, include_only, include_paths, ignore_paths):
    """Return True if a repository-relative path passes the include/ignore filters."""
    if include_only:
        # Include-only mode: plain string-prefix match against any "!" entry.
        return any(relative_path.startswith(prefix) for prefix in include_paths)

    # Ignore mode: reject the path when it, or any ancestor directory, is listed.
    current_path = ""
    for component in relative_path.split(os.sep):
        current_path = (
            os.path.join(current_path, component) if current_path else component
        )  # avoid joining onto an empty first segment
        if current_path in ignore_paths:
            return False
    return True


def get_latest_text_files_to_stdout(remote_repo_url=None, ignored_files=None):
    """
    Shallow-clone a Git repository into a temporary directory and print the
    contents of every file identified as text to stdout, each preceded by a
    ``--- <relative path> ---`` header.

    Git failures and other errors are reported by printing a message rather
    than by raising. The temporary directory is removed before returning.

    Args:
        remote_repo_url: The URL of the remote Git repository (optional). If None,
                         the current working directory must be inside a Git work
                         tree and is cloned instead.
        ignored_files: A list of files or directories to ignore (relative to the
                       repo root). If any entry starts with "!", the list switches
                       to "include only" mode: only paths whose relative path
                       starts with one of the "!"-prefixed values (prefix stripped)
                       are printed and the plain entries are not applied.
                       Otherwise ".git" and ".gitignore" are always ignored in
                       addition to the given entries. The list passed in is
                       never modified.
    """
    temp_dir = None

    # Work on a copy so the caller's list is left untouched.
    entries = list(ignored_files) if ignored_files else []

    include_only = any(item.startswith("!") for item in entries)
    include_paths = [item[1:] for item in entries if item.startswith("!")]
    ignore_paths = {item for item in entries if not item.startswith("!")}
    if not include_only:
        # Repository metadata is never wanted in a normal (ignore-mode) dump.
        ignore_paths.update((".git", ".gitignore"))

    try:
        # Create a temporary directory to clone into
        temp_dir = tempfile.mkdtemp()

        # Clone the repository, but only the latest commit (shallow clone)
        clone_command = ["git", "clone", "--depth", "1"]
        if remote_repo_url:
            clone_command.extend([remote_repo_url, temp_dir])
        else:
            # Check if the current directory is a Git repository.
            try:
                subprocess.run(
                    ["git", "rev-parse", "--is-inside-work-tree"],
                    check=True,
                    capture_output=True,
                    text=True,
                    cwd=os.getcwd(),
                )
            except subprocess.CalledProcessError:
                raise ValueError(
                    "No Git repository URL provided and current directory is not a Git repository."
                )
            clone_command.extend([os.getcwd(), temp_dir])  # clone current dir to temp

        subprocess.run(clone_command, check=True, capture_output=True, text=True)

        # Collect every file that passes the path filters and looks like text
        text_files = []
        for root, _, files in os.walk(temp_dir):
            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, temp_dir)

                if not _is_selected_path(
                    relative_path, include_only, include_paths, ignore_paths
                ):
                    continue

                if is_text_file(file_path):
                    text_files.append(file_path)

        # Print the contents of each text file, prepended by its relative path
        for file_path in text_files:
            relative_path = os.path.relpath(file_path, temp_dir)
            print(f"--- {relative_path} ---")
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    print(f.read())
            except UnicodeDecodeError:
                # Classified as text but not valid UTF-8: report and move on.
                print(
                    f"Error: Could not decode file {relative_path} using UTF-8.  Skipping file contents."
                )
            print()  # Add a blank line between files

    except subprocess.CalledProcessError as e:
        print(f"Error executing Git command: {e.stderr}")
    except ValueError as e:
        print(e)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Clean up the temporary directory
        if temp_dir:
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    # Command-line entry point: parse the options, optionally warn about an
    # unfamiliar host, then dump the repository's text files.
    cli = argparse.ArgumentParser(
        description="Checkout and print text files from a remote Git repository."
    )
    cli.add_argument(
        "-r",
        "--repo",
        required=False,
        help="The URL of the remote Git repository. If not provided, the current directory is used if it's a Git repository.",
    )
    cli.add_argument(
        "-i",
        "--ignored-files",
        nargs="+",
        default=[],
        help="Files or directories to ignore (space-separated).  Use !<path> to specify include only.",
    )
    options = cli.parse_args()

    # The host check is a plain substring test and only runs when a URL was given.
    known_hosts = ("github.com", "gitlab.com", "bitbucket.org")
    if options.repo and not any(host in options.repo for host in known_hosts):
        print(
            "Warning: This script is designed for common public repository hosting providers. Ensure the Git URL is correct."
        )

    get_latest_text_files_to_stdout(options.repo, options.ignored_files)