Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.
- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md: documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
#!/usr/bin/env python3
"""
LLM Performance Testing Framework (Simplified Version)
"""

import json
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional


@dataclass
class TestResult:
    """Container for test results"""
    model_name: str
    task_name: str
    response_time: float
    tps: float
    cpu_usage: float
    memory_usage: float
    quality_score: float
    raw_output: str
    expected_output: str
    success: bool
    timestamp: float


@dataclass
class PerformanceMetrics:
    """Container for performance metrics"""
    response_time: float
    tps: float
    cpu_avg: float
    memory_avg: float


class LLMInterface(ABC):
    """Abstract base class for LLM interfaces"""

    def __init__(self, model_name: str, config: Dict[str, Any]):
        self.model_name = model_name
        self.config = config

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Generate text from prompt"""

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Get model information"""


class TestSuite:
    """Main test suite class"""

    def __init__(self, output_dir: str = "test_results"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.results: List[TestResult] = []
        self.models: List[LLMInterface] = []

    def add_model(self, model: LLMInterface):
        """Add a model to test"""
        self.models.append(model)

    def run_single_test(self, model: LLMInterface, task: Dict[str, Any]) -> TestResult:
        """Run a single test on a model"""
        # Start timing
        start_time = time.time()

        # Simulated resource usage at the start of the run
        cpu_start = 50        # percent (simulated)
        memory_start = 1024   # MB (simulated)

        # Generate response
        try:
            response = model.generate(task["prompt"], **task.get("parameters", {}))
            success = True
        except Exception as e:
            response = f"Error: {str(e)}"
            success = False

        # Measure completion
        end_time = time.time()
        response_time = end_time - start_time

        # Calculate tokens per second (rough estimate based on whitespace tokens)
        tps = 0.0
        if success and response:
            tps = len(response.split()) / response_time if response_time > 0 else 0.0

        # Simulated resource usage at the end of the run
        cpu_end = 60          # percent (simulated)
        memory_end = 1536     # MB (simulated)

        # Calculate averages
        cpu_avg = (cpu_start + cpu_end) / 2
        memory_avg = (memory_start + memory_end) / 2

        # Quality score - this is a placeholder
        quality_score = self._calculate_quality_score(response, task.get("expected_output", ""))

        result = TestResult(
            model_name=model.model_name,
            task_name=task["name"],
            response_time=response_time,
            tps=tps,
            cpu_usage=cpu_avg,
            memory_usage=memory_avg,
            quality_score=quality_score,
            raw_output=response,
            expected_output=task.get("expected_output", ""),
            success=success,
            timestamp=time.time(),
        )

        self.results.append(result)
        return result

    def run_all_tests(self, tasks: List[Dict[str, Any]]):
        """Run all tests across all models"""
        for model in self.models:
            print(f"Running tests on {model.model_name}")
            for task in tasks:
                result = self.run_single_test(model, task)
                print(f"  {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")

    def _calculate_quality_score(self, response: str, expected: str) -> float:
        """Calculate quality score based on response"""
        # This is a placeholder implementation.
        # In practice, this could use static analysis, code execution, etc.
        if not response:
            return 0.0

        # Very basic quality scoring
        score = 50.0  # Base score

        # Add points for presence of expected patterns
        if expected and expected.lower() in response.lower():
            score += 20

        # Add points for output that at least looks like a real answer (simplified)
        if not response.startswith("Error:") and len(response) > 10:
            score += 10

        # Cap score at 100
        return min(score, 100.0)

    def save_results(self, filename: Optional[str] = None):
        """Save test results to a JSON file"""
        if filename is None:
            filename = f"results_{int(time.time())}.json"

        results_data = []
        for result in self.results:
            results_data.append({
                "model_name": result.model_name,
                "task_name": result.task_name,
                "response_time": result.response_time,
                "tps": result.tps,
                "cpu_usage": result.cpu_usage,
                "memory_usage": result.memory_usage,
                "quality_score": result.quality_score,
                "raw_output": result.raw_output,
                "expected_output": result.expected_output,
                "success": result.success,
                "timestamp": result.timestamp,
            })

        with open(self.output_dir / filename, 'w') as f:
            json.dump(results_data, f, indent=2)

        print(f"Results saved to {self.output_dir / filename}")


# Example model implementations
class MockLLM(LLMInterface):
    """Mock LLM for testing purposes"""

    def generate(self, prompt: str, **kwargs) -> str:
        # Simulate generation time
        time.sleep(0.1)
        return f"Mock response for: {prompt}"

    def get_model_info(self) -> Dict[str, Any]:
        return {
            "name": self.model_name,
            "type": "mock",
            "version": "1.0"
        }
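

# A sketch of a real backend for the harness (not part of the original file):
# it assumes the serving layer from the stack above (llama.cpp's server, vLLM,
# or Ollama) exposes an OpenAI-compatible /v1/chat/completions endpoint. The
# `base_url`/`timeout` config keys and the default URL are assumptions of this
# sketch; only the standard library is used, so it adds no extra dependency.
class OpenAICompatibleLLM(LLMInterface):
    """LLM interface backed by an OpenAI-compatible HTTP endpoint (sketch)"""

    def generate(self, prompt: str, **kwargs) -> str:
        import urllib.request

        base_url = self.config.get("base_url", "http://localhost:8080")
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            **kwargs,
        }
        request = urllib.request.Request(
            f"{base_url}/v1/chat/completions",
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=self.config.get("timeout", 120)) as resp:
            body = json.loads(resp.read().decode("utf-8"))
        return body["choices"][0]["message"]["content"]

    def get_model_info(self) -> Dict[str, Any]:
        return {
            "name": self.model_name,
            "type": "openai-compatible",
            "base_url": self.config.get("base_url", "http://localhost:8080"),
        }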


class ExampleTask:
    """Example task class for testing"""

    @staticmethod
    def get_sample_tasks() -> List[Dict[str, Any]]:
        return [
            {
                "name": "Simple function",
                "prompt": "Write a Python function to add two numbers",
                "expected_output": "return a + b",
                "parameters": {}
            },
            {
                "name": "Loop example",
                "prompt": "Write a Python loop that prints numbers 1 to 10",
                "expected_output": "for i in range(1, 11)",
                "parameters": {}
            }
        ]


if __name__ == "__main__":
    # Create test suite
    suite = TestSuite()

    # Add mock models
    suite.add_model(MockLLM("mock1", {}))
    suite.add_model(MockLLM("mock2", {}))

    # Run sample tests
    tasks = ExampleTask.get_sample_tasks()
    suite.run_all_tests(tasks)

    # Save results
    suite.save_results()

    print("Testing completed!")