pantheon/configs/scripts/git-extract.py
2025-03-27 03:19:25 +08:00

317 lines
11 KiB
Python

# flake8: noqa: E501
import subprocess
import os
import tempfile
import shutil
import argparse
import magic
import chardet
import math
def is_ascii(file_path):
    """
    Report whether a file consists solely of ASCII characters.

    Args:
        file_path (str): Location of the file to inspect.

    Returns:
        bool: True when the whole content decodes as ASCII, False when any
            non-ASCII byte is present.
        None: When nothing exists at file_path.
    """
    if not os.path.exists(file_path):
        return None  # Signal "file not found" to the caller.

    with open(file_path, "rb") as stream:
        raw = stream.read()

    # A strict ASCII decode fails on the first byte outside 0x00-0x7F.
    try:
        raw.decode("ascii")
    except UnicodeDecodeError:
        return False
    return True
def has_high_entropy(file_path, threshold=0.7):
    """
    Decide whether a file's byte entropy exceeds a threshold.

    The file is read in binary mode and its entropy is obtained from
    calculate_entropy(); the result is compared against ``threshold``.

    NOTE(review): calculate_entropy() uses a base-2 logarithm over byte
    values, so its result presumably ranges from 0 to 8. On that scale the
    default of 0.7 is a very low bar - confirm the intended scale.

    Args:
        file_path (str): Location of the file to inspect.
        threshold (float): Value the entropy must exceed to count as high.

    Returns:
        bool: True if the entropy is above the threshold or the file cannot
            be read; False for an empty file or entropy at/below the
            threshold.
        None: When nothing exists at file_path.
    """
    if not os.path.exists(file_path):
        return None

    try:
        # Binary mode: entropy is measured over raw bytes, not decoded text.
        with open(file_path, "rb") as stream:
            payload = stream.read()
    except IOError:
        # An unreadable file is treated as non-text.
        return True

    if len(payload) == 0:
        # Empty files are considered text.
        return False

    return calculate_entropy(payload) > threshold
def calculate_entropy(data):
    """
    Compute the Shannon entropy (base 2) of a byte string.

    Args:
        data (bytes): The bytes to measure.

    Returns:
        float: The entropy of the byte-value distribution; 0.0 for empty
            input.
    """
    if not data:
        return 0.0  # Nothing to measure, and avoids dividing by zero.

    total = len(data)

    # Histogram of how often each of the 256 possible byte values occurs.
    histogram = [0] * 256
    for value in data:
        histogram[value] += 1

    entropy = 0
    for occurrences in histogram:
        if occurrences:
            # Byte values that never occur contribute nothing (skips log(0)).
            share = float(occurrences) / total
            entropy -= share * math.log(share, 2)
    return entropy
def check_chardet_encoding(file_path, confidence_threshold=0.8):
    """
    Use the chardet library to judge whether a file has a text encoding.

    Args:
        file_path (str): Location of the file to inspect.
        confidence_threshold (float): Confidence that chardet's guess must
            exceed for the file to be accepted.

    Returns:
        bool: True for an empty file, or when chardet names an encoding
            (other than "binary") with confidence above the threshold;
            False otherwise, including when the file cannot be read.
        None: When nothing exists at file_path.
    """
    if not os.path.exists(file_path):
        return None

    try:
        # chardet works on raw bytes, so read in binary mode.
        with open(file_path, "rb") as stream:
            raw = stream.read()
    except IOError:
        # A file that cannot be opened is not treated as simple text.
        return False

    if not raw:
        # Empty files are usually considered text.
        return True

    detection = chardet.detect(raw)
    guessed = detection["encoding"]
    certainty = detection["confidence"]

    # Reject when no encoding was named or the guess is not confident enough.
    if not guessed or not (certainty > confidence_threshold):
        return False
    return guessed != "binary"
def is_text_file(file_path, aggressive=False):
    """
    Decide whether a file is a text file by combining several heuristics.

    The heuristics are is_ascii(), the inverse of has_high_entropy(), and
    check_chardet_encoding(). They are evaluated lazily, in that order, so
    the more expensive entropy scan and chardet detection only run when the
    earlier checks have not already decided the outcome.

    Args:
        file_path (str): The path to the file.
        aggressive (bool, optional): If True, every check must pass.
            If False, a single passing check is enough. Defaults to False.

    Returns:
        bool: True if the file is considered a text file, False otherwise.
        None: If the file does not exist.
    """
    if not os.path.exists(file_path):
        return None

    ascii_check = is_ascii(file_path)
    if ascii_check is None:
        return None  # File vanished between the two existence checks.

    if aggressive:
        # Strict mode: all checks must agree. `and` stops at the first
        # failing check, so a non-ASCII file is rejected without re-reading it.
        return (
            ascii_check
            and not has_high_entropy(file_path)
            and check_chardet_encoding(file_path)
        )

    # Lenient mode: any passing check is enough. `or` stops at the first
    # success, so a pure-ASCII file skips the entropy scan and chardet.
    return (
        ascii_check
        or not has_high_entropy(file_path)
        or check_chardet_encoding(file_path)
    )
def _is_ignored_path(relative_path, ignore_paths):
    """Return True if relative_path, or any directory above it, is in ignore_paths."""
    prefix = ""
    for component in relative_path.split(os.sep):
        # Grow the prefix one component at a time: "a", "a/b", "a/b/c", ...
        prefix = os.path.join(prefix, component) if prefix else component
        if prefix in ignore_paths:
            return True
    return False


def get_latest_text_files_to_stdout(remote_repo_url=None, ignored_files=None):
    """
    Print the text files of a repository's latest commit to stdout.

    The repository (a remote URL, or the current working directory when no
    URL is given) is shallow-cloned into a temporary folder. Every file that
    is_text_file() accepts is printed, preceded by a ``--- <relative path> ---``
    header, and followed by a blank line. The temporary folder is removed
    afterwards.

    Git failures, a missing repository and any other error are reported with
    print() rather than raised.

    Args:
        remote_repo_url: The URL of the remote Git repository (optional). If
            None, the current working directory must be a Git repository.
        ignored_files: A list of files or directories to ignore (relative to
            the repo root). ".git" and ".gitignore" are always ignored. If any
            entry starts with "!", the list switches to "include only" mode:
            only files whose relative path starts with one of the "!" entries
            (without the "!") are printed, and the other entries are not
            used. The list passed in is never modified.
    """
    # Work on a copy so the caller's list is left untouched.
    entries = list(ignored_files) if ignored_files else []

    include_paths = [entry[1:] for entry in entries if entry.startswith("!")]
    include_only = bool(include_paths)

    ignore_paths = set()
    if not include_only:
        # Normalise so that "docs/" or "./docs" match the walked path "docs".
        ignore_paths = {os.path.normpath(entry) for entry in entries}
        ignore_paths.update((".git", ".gitignore"))

    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp()

        if remote_repo_url:
            clone_source = remote_repo_url
        else:
            # Without a URL, the current directory has to be a Git work tree.
            try:
                subprocess.run(
                    ["git", "rev-parse", "--is-inside-work-tree"],
                    check=True,
                    capture_output=True,
                    text=True,
                    cwd=os.getcwd(),
                )
            except subprocess.CalledProcessError:
                raise ValueError(
                    "No Git repository URL provided and current directory is not a Git repository."
                )
            clone_source = os.getcwd()

        # Shallow clone: only the latest commit is needed. The "--" keeps a
        # source that starts with "-" from being parsed as a git option.
        subprocess.run(
            ["git", "clone", "--depth", "1", "--", clone_source, temp_dir],
            check=True,
            capture_output=True,
            text=True,
        )

        # Collect the text files that survive the include/ignore filter.
        text_files = []
        for root, dirs, files in os.walk(temp_dir):
            if not include_only:
                # Prune ignored directories in place so os.walk never
                # descends into them (notably ".git").
                dirs[:] = [
                    name
                    for name in dirs
                    if not _is_ignored_path(
                        os.path.relpath(os.path.join(root, name), temp_dir),
                        ignore_paths,
                    )
                ]
            for file in files:
                file_path = os.path.join(root, file)
                relative_path = os.path.relpath(file_path, temp_dir)
                if include_only:
                    if not any(
                        relative_path.startswith(include_path)
                        for include_path in include_paths
                    ):
                        continue  # Not under any include path.
                elif _is_ignored_path(relative_path, ignore_paths):
                    continue
                if is_text_file(file_path):
                    text_files.append(file_path)

        # Print the contents of each text file, prepended by its relative path.
        for file_path in text_files:
            relative_path = os.path.relpath(file_path, temp_dir)
            print(f"--- {relative_path} ---")
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    print(f.read())
            except UnicodeDecodeError:
                # The file passed the text heuristics but is not valid UTF-8.
                print(
                    f"Error: Could not decode file {relative_path} using UTF-8. Skipping file contents."
                )
            print()  # Blank line between files.
    except subprocess.CalledProcessError as e:
        print(f"Error executing Git command: {e.stderr}")
    except ValueError as e:
        print(e)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Clean up the temporary directory.
        if temp_dir:
            shutil.rmtree(temp_dir)
if __name__ == "__main__":
    # Command-line entry point: parse the options, warn about unfamiliar
    # hosts, then print the repository's text files.
    cli = argparse.ArgumentParser(
        description="Checkout and print text files from a remote Git repository."
    )
    cli.add_argument(
        "-r",
        "--repo",
        required=False,
        help="The URL of the remote Git repository. If not provided, the current directory is used if it's a Git repository.",
    )
    cli.add_argument(
        "-i",
        "--ignored-files",
        nargs="+",
        default=[],
        help="Files or directories to ignore (space-separated). Use !<path> to specify include only.",
    )
    options = cli.parse_args()

    # The host check only applies when a URL was actually supplied.
    known_hosts = ("github.com", "gitlab.com", "bitbucket.org")
    if options.repo and not any(host in options.repo for host in known_hosts):
        print(
            "Warning: This script is designed for common public repository hosting providers. Ensure the Git URL is correct."
        )

    get_latest_text_files_to_stdout(options.repo, options.ignored_files)