localgenai/testing/qwen3-coder-30b/find_duplicates.py
noisedestroyers 2c4bfefa95 Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
    for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
  documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00


#!/usr/bin/env python3
"""
Find duplicate files in a directory tree by content hash, with parallelization support.
"""
import hashlib
import sys
from collections import defaultdict
from pathlib import Path
from multiprocessing import Pool, cpu_count
from typing import Dict, List, Optional, Tuple


def get_file_hash(filepath: Path, chunk_size: int = 8192) -> str:
    """
    Calculate the SHA256 hash of a file.

    Args:
        filepath: Path to the file
        chunk_size: Size of chunks to read at a time

    Returns:
        SHA256 hash of the file as a hexadecimal string
    """
    hash_sha256 = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            # Read file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return ""


def process_file(filepath: Path) -> Tuple[str, Path]:
    """
    Process a single file and return its hash and path.

    Args:
        filepath: Path to the file

    Returns:
        Tuple of (file_hash, file_path)
    """
    file_hash = get_file_hash(filepath)
    return (file_hash, filepath)


def find_duplicates_parallel(root_dir: Path, num_processes: Optional[int] = None) -> Dict[str, List[Path]]:
    """
    Find duplicate files in a directory tree by content hash using parallel processing.

    Args:
        root_dir: Root directory to search
        num_processes: Number of processes to use (default: CPU count)

    Returns:
        Dictionary mapping file hashes to lists of file paths with that hash
    """
    if num_processes is None:
        num_processes = cpu_count()

    # Get all files in the directory tree
    files = []
    for file_path in root_dir.rglob('*'):
        if file_path.is_file():
            files.append(file_path)
    print(f"Found {len(files)} files to process")

    # Process files in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_file, files)

    # Group files by hash
    hash_to_files = defaultdict(list)
    for file_hash, file_path in results:
        if file_hash:  # Only add files that were successfully hashed
            hash_to_files[file_hash].append(file_path)

    # Filter out unique files (only keep duplicates)
    duplicates = {hash_val: file_list for hash_val, file_list in hash_to_files.items()
                  if len(file_list) > 1}
    return duplicates


def print_duplicates(duplicates: Dict[str, List[Path]]) -> None:
    """
    Print the duplicate files in a formatted way.

    Args:
        duplicates: Dictionary mapping file hashes to lists of file paths
    """
    if not duplicates:
        print("No duplicates found.")
        return

    print(f"Found {len(duplicates)} sets of duplicate files:")
    for hash_val, file_list in duplicates.items():
        print(f"\nHash: {hash_val}")
        for file_path in file_list:
            print(f" {file_path}")


def main():
    """Main function to run the duplicate file finder."""
    if len(sys.argv) < 2:
        print("Usage: python find_duplicates.py <root_directory> [num_processes]")
        sys.exit(1)

    root_dir = Path(sys.argv[1])
    if not root_dir.exists():
        print(f"Error: Directory {root_dir} does not exist")
        sys.exit(1)

    num_processes = None
    if len(sys.argv) > 2:
        try:
            num_processes = int(sys.argv[2])
        except ValueError:
            print("Error: num_processes must be an integer")
            sys.exit(1)

    print(f"Searching for duplicates in {root_dir} using {num_processes or cpu_count()} processes")
    duplicates = find_duplicates_parallel(root_dir, num_processes)
    print_duplicates(duplicates)


if __name__ == "__main__":
    main()
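
The module can also be driven from another script instead of the CLI. The snippet below is only a usage sketch, not part of the repo: the directory path is a placeholder, and it assumes find_duplicates.py is importable from the working directory.

from pathlib import Path

from find_duplicates import find_duplicates_parallel, print_duplicates

if __name__ == "__main__":
    # The __main__ guard matters here: multiprocessing may re-import this module
    # in worker processes on platforms that use the spawn start method.
    # "./data" is a placeholder; point it at a real tree to scan.
    duplicates = find_duplicates_parallel(Path("./data"), num_processes=4)

    # Each key is a SHA256 hex digest; each value is the list of paths sharing that content.
    for digest, paths in duplicates.items():
        print(f"{digest}: {len(paths)} copies")

    # Or reuse the script's own formatter.
    print_duplicates(duplicates)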