#!/usr/bin/env python3
"""
Find duplicate files in a directory tree by content hash, with parallelization support.
"""

import hashlib
import sys
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path
from typing import Dict, List, Optional, Tuple


def get_file_hash(filepath: Path, chunk_size: int = 8192) -> str:
    """
    Calculate the SHA256 hash of a file.

    Args:
        filepath: Path to the file
        chunk_size: Size of chunks to read at a time

    Returns:
        SHA256 hash of the file as a hexadecimal string, or an empty string if
        the file could not be read
    """
    hash_sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            # Read file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return ""


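# Usage sketch for get_file_hash (illustrative only; "somefile.bin" is a hypothetical path):
#
#     digest = get_file_hash(Path("somefile.bin"))
#     if digest:
#         print(digest)
#
# An empty string signals a read error, so callers should check the value before
# treating it as a real digest (find_duplicates_parallel below does exactly that).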
def process_file(filepath: Path) -> Tuple[str, Path]:
    """
    Process a single file and return its hash and path.

    Args:
        filepath: Path to the file

    Returns:
        Tuple of (file_hash, file_path)
    """
    file_hash = get_file_hash(filepath)
    return (file_hash, filepath)


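# Note: Pool.map pickles the callable it dispatches to worker processes, so the
# per-file work has to live in a module-level function like process_file above;
# a lambda or nested function would not be picklable with the default pickler.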
def find_duplicates_parallel(root_dir: Path, num_processes: Optional[int] = None) -> Dict[str, List[Path]]:
    """
    Find duplicate files in a directory tree by content hash using parallel processing.

    Args:
        root_dir: Root directory to search
        num_processes: Number of processes to use (default: CPU count)

    Returns:
        Dictionary mapping file hashes to lists of file paths with that hash
    """
    if num_processes is None:
        num_processes = cpu_count()

    # Get all files in the directory tree
    files = []
    for file_path in root_dir.rglob('*'):
        if file_path.is_file():
            files.append(file_path)

    print(f"Found {len(files)} files to process")

    # Process files in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_file, files)

    # Group files by hash
    hash_to_files = defaultdict(list)
    for file_hash, file_path in results:
        if file_hash:  # Only add files that were successfully hashed
            hash_to_files[file_hash].append(file_path)

    # Filter out unique files (only keep duplicates)
    duplicates = {hash_val: file_list for hash_val, file_list in hash_to_files.items()
                  if len(file_list) > 1}

    return duplicates


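# Library-style usage sketch (hypothetical directory; the script itself drives this
# through main() below):
#
#     dupes = find_duplicates_parallel(Path("/data/photos"), num_processes=4)
#     for digest, paths in dupes.items():
#         print(digest, [str(p) for p in paths])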
def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """
    Print the duplicate files in a formatted way.

    Args:
        duplicates: Dictionary mapping file hashes to lists of file paths
    """
    if not duplicates:
        print("No duplicates found.")
        return

    print(f"Found {len(duplicates)} sets of duplicate files:")
    for hash_val, file_list in duplicates.items():
        print(f"\nHash: {hash_val}")
        for file_path in file_list:
            print(f"  {file_path}")


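# Example output shape (illustrative; the digest and paths below are placeholders,
# not real values):
#
#     Found 1 sets of duplicate files:
#
#     Hash: <sha256 hex digest>
#       /some/dir/report.pdf
#       /some/dir/backup/report.pdf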
def main():
    """Main function to run the duplicate file finder."""
    if len(sys.argv) < 2:
        print("Usage: python find_duplicates.py <root_directory> [num_processes]")
        sys.exit(1)

    root_dir = Path(sys.argv[1])
    if not root_dir.exists():
        print(f"Error: Directory {root_dir} does not exist")
        sys.exit(1)

    num_processes = None
    if len(sys.argv) > 2:
        try:
            num_processes = int(sys.argv[2])
        except ValueError:
            print("Error: num_processes must be an integer")
            sys.exit(1)

    print(f"Searching for duplicates in {root_dir} using {num_processes or cpu_count()} processes")

    duplicates = find_duplicates_parallel(root_dir, num_processes)
    print_duplicates(duplicates)


if __name__ == "__main__":
    main()
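# Example invocation (illustrative; "~/Pictures" and the process count 8 are just
# placeholders):
#
#     python find_duplicates.py ~/Pictures 8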