#!/usr/bin/env python3
"""
Find duplicate files in a directory tree by content hash, with parallelization support.
"""

import hashlib
import sys
from collections import defaultdict
from multiprocessing import Pool, cpu_count
from pathlib import Path
from typing import Dict, List, Optional, Tuple


def get_file_hash(filepath: Path, chunk_size: int = 8192) -> str:
    """
    Calculate the SHA256 hash of a file.

    Args:
        filepath: Path to the file
        chunk_size: Size of chunks to read at a time

    Returns:
        SHA256 hash of the file as a hexadecimal string, or an empty string if
        the file could not be read
    """
    hash_sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            # Read file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return ""


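# Usage sketch for get_file_hash (illustrative only; "somefile.bin" is a hypothetical path):
#
#     digest = get_file_hash(Path("somefile.bin"))
#     if digest:
#         print(digest)
#
# An empty string signals a read error, so callers should check the value before
# treating it as a real digest (find_duplicates_parallel below does exactly that).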
def process_file(filepath: Path) -> Tuple[str, Path]:
    """
    Process a single file and return its hash and path.

    Args:
        filepath: Path to the file

    Returns:
        Tuple of (file_hash, file_path)
    """
    file_hash = get_file_hash(filepath)
    return (file_hash, filepath)


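# Note: Pool.map pickles the callable it dispatches to worker processes, so the
# per-file work has to live in a module-level function like process_file above;
# a lambda or nested function would not be picklable with the default pickler.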
def find_duplicates_parallel(root_dir: Path, num_processes: Optional[int] = None) -> Dict[str, List[Path]]:
    """
    Find duplicate files in a directory tree by content hash using parallel processing.

    Args:
        root_dir: Root directory to search
        num_processes: Number of processes to use (default: CPU count)

    Returns:
        Dictionary mapping file hashes to lists of file paths with that hash
    """
    if num_processes is None:
        num_processes = cpu_count()

    # Get all files in the directory tree
    files = []
    for file_path in root_dir.rglob('*'):
        if file_path.is_file():
            files.append(file_path)

    print(f"Found {len(files)} files to process")

    # Process files in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_file, files)

    # Group files by hash
    hash_to_files = defaultdict(list)
    for file_hash, file_path in results:
        if file_hash:  # Only add files that were successfully hashed
            hash_to_files[file_hash].append(file_path)

    # Filter out unique files (only keep duplicates)
    duplicates = {hash_val: file_list for hash_val, file_list in hash_to_files.items()
                  if len(file_list) > 1}

    return duplicates


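# Library-style usage sketch (hypothetical directory; the script itself drives this
# through main() below):
#
#     dupes = find_duplicates_parallel(Path("/data/photos"), num_processes=4)
#     for digest, paths in dupes.items():
#         print(digest, [str(p) for p in paths])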
def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """
    Print the duplicate files in a formatted way.

    Args:
        duplicates: Dictionary mapping file hashes to lists of file paths
    """
    if not duplicates:
        print("No duplicates found.")
        return

    print(f"Found {len(duplicates)} sets of duplicate files:")
    for hash_val, file_list in duplicates.items():
        print(f"\nHash: {hash_val}")
        for file_path in file_list:
            print(f"  {file_path}")


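# Example output shape (illustrative; the digest and paths below are placeholders,
# not real values):
#
#     Found 1 sets of duplicate files:
#
#     Hash: <sha256 hex digest>
#       /some/dir/report.pdf
#       /some/dir/backup/report.pdf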
def main():
    """Main function to run the duplicate file finder."""
    if len(sys.argv) < 2:
        print("Usage: python find_duplicates.py <root_directory> [num_processes]")
        sys.exit(1)

    root_dir = Path(sys.argv[1])
    if not root_dir.exists():
        print(f"Error: Directory {root_dir} does not exist")
        sys.exit(1)

    num_processes = None
    if len(sys.argv) > 2:
        try:
            num_processes = int(sys.argv[2])
        except ValueError:
            print("Error: num_processes must be an integer")
            sys.exit(1)

    print(f"Searching for duplicates in {root_dir} using {num_processes or cpu_count()} processes")

    duplicates = find_duplicates_parallel(root_dir, num_processes)
    print_duplicates(duplicates)


if __name__ == "__main__":
    main()
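# Example invocation (illustrative; "~/Pictures" and the process count 8 are just
# placeholders):
#
#     python find_duplicates.py ~/Pictures 8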