localgenai/testing/qwen3-coder-30b/find_duplicates.py
noisedestroyers 2c4bfefa95 Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
    for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
  documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00


#!/usr/bin/env python3
"""
Find duplicate files in a directory tree by content hash, with parallelization support.
"""
import hashlib
import sys
from collections import defaultdict
from pathlib import Path
from multiprocessing import Pool, cpu_count
from typing import Dict, List, Optional, Tuple


def get_file_hash(filepath: Path, chunk_size: int = 8192) -> str:
    """
    Calculate the SHA256 hash of a file.

    Args:
        filepath: Path to the file
        chunk_size: Size of chunks to read at a time

    Returns:
        SHA256 hash of the file as a hexadecimal string
    """
    hash_sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            # Read file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return ""


def process_file(filepath: Path) -> Tuple[str, Path]:
    """
    Process a single file and return its hash and path.

    Args:
        filepath: Path to the file

    Returns:
        Tuple of (file_hash, file_path)
    """
    file_hash = get_file_hash(filepath)
    return (file_hash, filepath)


def find_duplicates_parallel(root_dir: Path, num_processes: Optional[int] = None) -> Dict[str, List[Path]]:
    """
    Find duplicate files in a directory tree by content hash using parallel processing.

    Args:
        root_dir: Root directory to search
        num_processes: Number of processes to use (default: CPU count)

    Returns:
        Dictionary mapping file hashes to lists of file paths with that hash
    """
    if num_processes is None:
        num_processes = cpu_count()

    # Get all files in the directory tree
    files = []
    for file_path in root_dir.rglob('*'):
        if file_path.is_file():
            files.append(file_path)
    print(f"Found {len(files)} files to process")

    # Process files in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_file, files)

    # Group files by hash
    hash_to_files = defaultdict(list)
    for file_hash, file_path in results:
        if file_hash:  # Only add files that were successfully hashed
            hash_to_files[file_hash].append(file_path)

    # Filter out unique files (only keep duplicates)
    duplicates = {hash_val: file_list for hash_val, file_list in hash_to_files.items()
                  if len(file_list) > 1}
    return duplicates


def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """
    Print the duplicate files in a formatted way.

    Args:
        duplicates: Dictionary mapping file hashes to lists of file paths
    """
    if not duplicates:
        print("No duplicates found.")
        return

    print(f"Found {len(duplicates)} sets of duplicate files:")
    for hash_val, file_list in duplicates.items():
        print(f"\nHash: {hash_val}")
        for file_path in file_list:
            print(f" {file_path}")


def main():
    """Main function to run the duplicate file finder."""
    if len(sys.argv) < 2:
        print("Usage: python find_duplicates.py <root_directory> [num_processes]")
        sys.exit(1)

    root_dir = Path(sys.argv[1])
    if not root_dir.exists():
        print(f"Error: Directory {root_dir} does not exist")
        sys.exit(1)

    num_processes = None
    if len(sys.argv) > 2:
        try:
            num_processes = int(sys.argv[2])
        except ValueError:
            print("Error: num_processes must be an integer")
            sys.exit(1)

    print(f"Searching for duplicates in {root_dir} using {num_processes or cpu_count()} processes")
    duplicates = find_duplicates_parallel(root_dir, num_processes)
    print_duplicates(duplicates)


if __name__ == "__main__":
    main()
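
The module can also be driven from another script instead of the CLI. The snippet below is only a usage sketch, not part of the repo: the directory path is a placeholder, and it assumes find_duplicates.py is importable from the working directory.

from pathlib import Path

from find_duplicates import find_duplicates_parallel, print_duplicates

if __name__ == "__main__":
    # The __main__ guard matters here: multiprocessing may re-import this module
    # in worker processes on platforms that use the spawn start method.
    # "./data" is a placeholder; point it at a real tree to scan.
    duplicates = find_duplicates_parallel(Path("./data"), num_processes=4)

    # Each key is a SHA256 hex digest; each value is the list of paths sharing that content.
    for digest, paths in duplicates.items():
        print(f"{digest}: {len(paths)} copies")

    # Or reuse the script's own formatter.
    print_duplicates(duplicates)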