#!/usr/bin/env python3
"""
Find duplicate files in a directory tree by content hash,
with parallelization support.
"""

import hashlib
import sys
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path
from typing import Dict, List, Optional, Tuple


def get_file_hash(filepath: Path, chunk_size: int = 8192) -> str:
    """
    Calculate the SHA256 hash of a file.

    Args:
        filepath: Path to the file
        chunk_size: Size of chunks to read at a time

    Returns:
        SHA256 hash of the file as a hexadecimal string,
        or an empty string if the file could not be read
    """
    hash_sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            # Read the file in chunks so large files don't exhaust memory
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()
    except OSError as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return ""


def process_file(filepath: Path) -> Tuple[str, Path]:
    """
    Process a single file and return its hash and path.

    Args:
        filepath: Path to the file

    Returns:
        Tuple of (file_hash, file_path)
    """
    return (get_file_hash(filepath), filepath)


def find_duplicates_parallel(root_dir: Path,
                             num_processes: Optional[int] = None) -> Dict[str, List[Path]]:
    """
    Find duplicate files in a directory tree by content hash
    using parallel processing.

    Args:
        root_dir: Root directory to search
        num_processes: Number of processes to use (default: CPU count)

    Returns:
        Dictionary mapping file hashes to lists of file paths with that hash
    """
    if num_processes is None:
        num_processes = cpu_count()

    # Collect all regular files in the tree; skip symlinks so a link and
    # its target are not reported as duplicates of each other.
    files = [p for p in root_dir.rglob('*')
             if p.is_file() and not p.is_symlink()]
    print(f"Found {len(files)} files to process")

    # Hash the files in parallel across worker processes
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_file, files)

    # Group files by hash
    hash_to_files: Dict[str, List[Path]] = defaultdict(list)
    for file_hash, file_path in results:
        if file_hash:  # Only add files that were successfully hashed
            hash_to_files[file_hash].append(file_path)

    # Keep only hashes shared by more than one file
    return {hash_val: file_list
            for hash_val, file_list in hash_to_files.items()
            if len(file_list) > 1}


def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """
    Print the duplicate files in a formatted way.

    Args:
        duplicates: Dictionary mapping file hashes to lists of file paths
    """
    if not duplicates:
        print("No duplicates found.")
        return

    print(f"Found {len(duplicates)} sets of duplicate files:")
    for hash_val, file_list in duplicates.items():
        print(f"\nHash: {hash_val}")
        for file_path in file_list:
            print(f"  {file_path}")


def main():
    """Main function to run the duplicate file finder."""
    if len(sys.argv) < 2:
        print("Usage: python find_duplicates.py <directory> [num_processes]")
        sys.exit(1)

    root_dir = Path(sys.argv[1])
    if not root_dir.is_dir():
        print(f"Error: {root_dir} is not a directory")
        sys.exit(1)

    num_processes = None
    if len(sys.argv) > 2:
        try:
            num_processes = int(sys.argv[2])
        except ValueError:
            print("Error: num_processes must be an integer")
            sys.exit(1)

    print(f"Searching for duplicates in {root_dir} "
          f"using {num_processes or cpu_count()} processes")
    duplicates = find_duplicates_parallel(root_dir, num_processes)
    print_duplicates(duplicates)


if __name__ == "__main__":
    main()
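
# Usage sketch (the paths below are hypothetical, shown for illustration only):
#   $ python find_duplicates.py /path/to/photos       # workers default to cpu_count()
#   $ python find_duplicates.py /path/to/photos 4     # cap at 4 worker processes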