Initial commit: localgenai stack

Containerized local LLM stack for the Framework Desktop / Strix Halo, plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
- llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override for gfx1151), OpenWebUI
- Beszel (host + container + AMD GPU dashboard via sysfs)
- OpenLIT (LLM fleet metrics)
- Phoenix (per-trace agent waterfall)
- OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
- install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md: documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00
commit 2c4bfefa95
36 changed files with 5265 additions and 0 deletions
--- a/testing/qwen3-coder-30b/llm_test_framework_simple.py
+++ b/testing/qwen3-coder-30b/llm_test_framework_simple.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+LLM Performance Testing Framework (Simplified Version)
+"""
+
+import json
+import os
+import time
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+@dataclass
+class TestResult:
+    """Outcome of running one task against one model.
+
+    Instances are built by ``TestSuite.run_single_test`` and written out
+    field-by-field by ``TestSuite.save_results``.
+    """
+    # ``model_name`` attribute of the model that was queried.
+    model_name: str
+    # ``"name"`` entry of the task dictionary.
+    task_name: str
+    # Elapsed seconds measured around the ``generate`` call.
+    response_time: float
+    # Whitespace-separated words in the output per second (a rough stand-in
+    # for tokens/s); 0 when generation failed or returned nothing.
+    tps: float
+    # Mean of the simulated start/end CPU percentages (not a measurement).
+    cpu_usage: float
+    # Mean of the simulated start/end memory figures in MB (not a measurement).
+    memory_usage: float
+    # Placeholder heuristic score, never above 100.
+    quality_score: float
+    # Text returned by the model, or "Error: <message>" when generation raised.
+    raw_output: str
+    # The task's "expected_output" entry, or "" when the task has none.
+    expected_output: str
+    # True when ``generate`` returned without raising.
+    success: bool
+    # ``time.time()`` value captured when the result was created.
+    timestamp: float
+
+@dataclass
+class PerformanceMetrics:
+    """Bundle of timing and resource figures.
+
+    NOTE(review): nothing in the visible part of this file constructs or
+    reads this class; the units noted below are presumed to match the
+    corresponding ``TestResult`` fields -- confirm before relying on them.
+    """
+    response_time: float  # presumably seconds
+    tps: float  # presumably the same rough words-per-second rate
+    cpu_avg: float  # presumably percent
+    memory_avg: float  # presumably MB
+
+class LLMInterface(ABC):
+    """Contract that every model backend under test must fulfil.
+
+    Concrete subclasses supply ``generate`` and ``get_model_info``; the
+    constructor only stores the model's name and its configuration mapping.
+    """
+
+    def __init__(self, model_name: str, config: Dict[str, Any]):
+        self.model_name, self.config = model_name, config
+
+    @abstractmethod
+    def generate(self, prompt: str, **kwargs) -> str:
+        """Produce the model's text output for ``prompt``."""
+
+    @abstractmethod
+    def get_model_info(self) -> Dict[str, Any]:
+        """Describe the model as a dictionary."""
+
+class TestSuite:
+    """Run prompt tasks against registered models and collect the results.
+
+    Only the response time is actually measured: the CPU and memory figures
+    are simulated constants and the quality score is a placeholder heuristic.
+
+    Attributes:
+        output_dir: Directory that ``save_results`` writes JSON files into.
+        results: Every ``TestResult`` produced so far, in execution order.
+        models: Models registered through ``add_model``.
+    """
+
+    def __init__(self, output_dir: str = "test_results"):
+        """Create the suite and make sure the output directory exists.
+
+        Args:
+            output_dir: Directory for result files. Missing parent
+                directories are created as well.
+        """
+        self.output_dir = Path(output_dir)
+        # parents=True lets callers pass nested paths such as "runs/today".
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.results: List[TestResult] = []
+        self.models: List[LLMInterface] = []
+
+    def add_model(self, model: LLMInterface):
+        """Register a model to be exercised by ``run_all_tests``."""
+        self.models.append(model)
+
+    def run_single_test(self, model: LLMInterface, task: Dict[str, Any]) -> TestResult:
+        """Run one task on one model and record the outcome.
+
+        Any exception raised while generating (including a missing
+        ``"prompt"`` key) is recorded as a failed result rather than
+        propagated.
+
+        Args:
+            model: Model to query.
+            task: Mapping with keys ``"name"`` and ``"prompt"`` and the
+                optional keys ``"parameters"`` (keyword arguments forwarded
+                to ``model.generate``) and ``"expected_output"``.
+
+        Returns:
+            The ``TestResult`` for this run; it is also appended to
+            ``self.results``.
+
+        Raises:
+            KeyError: If ``task`` has no ``"name"`` key.
+        """
+        # Simulated resource readings; no real sampling happens here.
+        cpu_start, cpu_end = 50, 60  # percent
+        memory_start, memory_end = 1024, 1536  # MB
+
+        # perf_counter is monotonic, so the measured duration cannot be
+        # skewed by system clock adjustments the way time.time() can.
+        started = time.perf_counter()
+        try:
+            response = model.generate(task["prompt"], **task.get("parameters", {}))
+            success = True
+        except Exception as e:
+            # Harness boundary: a failing model is recorded, not fatal.
+            response = f"Error: {str(e)}"
+            success = False
+        response_time = time.perf_counter() - started
+
+        # Rough throughput: whitespace-separated words stand in for tokens.
+        tps = 0.0
+        if success and response and response_time > 0:
+            tps = len(response.split()) / response_time
+
+        expected = task.get("expected_output", "")
+        # A failed call produced no model output, so it earns no quality
+        # points instead of the heuristic's base score for the error text.
+        if success:
+            quality_score = self._calculate_quality_score(response, expected)
+        else:
+            quality_score = 0.0
+
+        result = TestResult(
+            model_name=model.model_name,
+            task_name=task["name"],
+            response_time=response_time,
+            tps=tps,
+            cpu_usage=(cpu_start + cpu_end) / 2,
+            memory_usage=(memory_start + memory_end) / 2,
+            quality_score=quality_score,
+            raw_output=response,
+            expected_output=expected,
+            success=success,
+            timestamp=time.time(),
+        )
+
+        self.results.append(result)
+        return result
+
+    def run_all_tests(self, tasks: List[Dict[str, Any]]):
+        """Run every task on every registered model, printing a line per run.
+
+        Args:
+            tasks: Task mappings in the shape ``run_single_test`` accepts.
+        """
+        for model in self.models:
+            print(f"Running tests on {model.model_name}")
+            for task in tasks:
+                result = self.run_single_test(model, task)
+                print(f"  {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")
+
+    def _calculate_quality_score(self, response: str, expected: str) -> float:
+        """Score a response on a 0-100 scale with a placeholder heuristic.
+
+        An empty response scores 0. Otherwise the score starts at 50, gains
+        20 when ``expected`` is non-empty and occurs in ``response``
+        (case-insensitive), and gains 10 when the response does not start
+        with "Error:" and is longer than 10 characters.
+
+        Args:
+            response: Text produced by the model.
+            expected: Substring the response is expected to contain.
+
+        Returns:
+            The score as a float, capped at 100.
+        """
+        if not response:
+            return 0.0
+
+        # Float arithmetic throughout so the return type matches the annotation.
+        score = 50.0
+        if expected and expected.lower() in response.lower():
+            score += 20.0
+        if not response.startswith("Error:") and len(response) > 10:
+            score += 10.0
+        return min(score, 100.0)
+
+    def save_results(self, filename: Optional[str] = None) -> Path:
+        """Write all collected results to a JSON file in ``output_dir``.
+
+        Args:
+            filename: File name inside ``output_dir``. Defaults to
+                ``results_<unix timestamp>.json``.
+
+        Returns:
+            Path of the file that was written.
+        """
+        if filename is None:
+            filename = f"results_{int(time.time())}.json"
+        target = self.output_dir / filename
+
+        # asdict keeps the JSON keys in step with TestResult's fields.
+        results_data = [asdict(result) for result in self.results]
+
+        with open(target, 'w', encoding="utf-8") as f:
+            json.dump(results_data, f, indent=2)
+
+        print(f"Results saved to {target}")
+        return target
+
+# Example model implementations
+class MockLLM(LLMInterface):
+    """Stand-in backend that echoes the prompt after a short delay."""
+
+    def generate(self, prompt: str, **kwargs) -> str:
+        """Sleep for 0.1 s to mimic latency, then echo ``prompt``."""
+        time.sleep(0.1)
+        reply = f"Mock response for: {prompt}"
+        return reply
+
+    def get_model_info(self) -> Dict[str, Any]:
+        """Return a fixed description carrying this mock's name."""
+        info: Dict[str, Any] = {"name": self.model_name}
+        info["type"] = "mock"
+        info["version"] = "1.0"
+        return info
+
+class ExampleTask:
+    """Supplies a small built-in set of coding prompts for demo runs."""
+
+    @staticmethod
+    def get_sample_tasks() -> List[Dict[str, Any]]:
+        """Return the sample tasks as freshly built dictionaries."""
+        # (name, prompt, expected substring) for each sample.
+        samples = (
+            (
+                "Simple function",
+                "Write a Python function to add two numbers",
+                "return a + b",
+            ),
+            (
+                "Loop example",
+                "Write a Python loop that prints numbers 1 to 10",
+                "for i in range(1, 11)",
+            ),
+        )
+        return [
+            {
+                "name": title,
+                "prompt": prompt,
+                "expected_output": expected,
+                "parameters": {},
+            }
+            for title, prompt, expected in samples
+        ]
+
+if __name__ == "__main__":
+    # Demo run: two mock backends exercised on the bundled sample tasks.
+    suite = TestSuite()
+    for mock_name in ("mock1", "mock2"):
+        suite.add_model(MockLLM(mock_name, {}))
+
+    # Execute every sample task on each registered model.
+    tasks = ExampleTask.get_sample_tasks()
+    suite.run_all_tests(tasks)
+
+    # Persist what was collected, then report completion.
+    suite.save_results()
+
+    print("Testing completed!")