localgenai/testing/qwen3-coder-30b/llm_test_framework.py
noisedestroyers 2c4bfefa95 Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.

- pyinfra/framework/: pyinfra deploy targeting the box
  - llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
    for gfx1151), OpenWebUI
  - Beszel (host + container + AMD GPU dashboard via sysfs)
  - OpenLIT (LLM fleet metrics)
  - Phoenix (per-trace agent waterfall)
  - OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
  - install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
  documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 11:35:10 -04:00


#!/usr/bin/env python3
"""
LLM Performance Testing Framework

Runs a set of prompt tasks against one or more local models, recording latency,
rough tokens-per-second, CPU/memory/GPU usage, and a simple quality score.
"""
import time
import json
import psutil
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
from pathlib import Path

@dataclass
class TestResult:
    """Container for test results"""
    model_name: str
    task_name: str
    response_time: float
    tps: float
    cpu_usage: float
    memory_usage: float
    gpu_usage: float
    quality_score: float
    raw_output: str
    expected_output: str
    success: bool
    timestamp: float


@dataclass
class PerformanceMetrics:
    """Container for performance metrics"""
    response_time: float
    tps: float
    cpu_avg: float
    memory_avg: float
    gpu_avg: float


class LLMInterface(ABC):
    """Abstract base class for LLM interfaces"""

    def __init__(self, model_name: str, config: Dict[str, Any]):
        self.model_name = model_name
        self.config = config

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Generate text from prompt"""
        pass

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Get model information"""
        pass


class TestSuite:
    """Main test suite class"""

    def __init__(self, output_dir: str = "test_results"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.results: List[TestResult] = []
        self.models: List[LLMInterface] = []

    def add_model(self, model: LLMInterface):
        """Add a model to test"""
        self.models.append(model)

    def run_single_test(self, model: LLMInterface, task: Dict[str, Any]) -> TestResult:
        """Run a single test on a model"""
        # Sample resources before generation
        start_time = time.time()
        cpu_start = psutil.cpu_percent(interval=1)
        memory_start = psutil.virtual_memory().used
        gpu_start = self._get_gpu_usage()

        # Generate response
        try:
            response = model.generate(task["prompt"], **task.get("parameters", {}))
            success = True
        except Exception as e:
            response = f"Error: {str(e)}"
            success = False

        # Measure completion
        end_time = time.time()
        response_time = end_time - start_time

        # Estimate tokens per second; word count is a very rough stand-in
        # for a real tokenizer
        tps = 0.0
        if success and response and response_time > 0:
            tps = len(response.split()) / response_time

        # Sample resources after generation and average with the start samples
        cpu_end = psutil.cpu_percent(interval=1)
        memory_end = psutil.virtual_memory().used
        gpu_end = self._get_gpu_usage()
        cpu_avg = (cpu_start + cpu_end) / 2
        memory_avg = (memory_start + memory_end) / 2
        gpu_avg = (gpu_start + gpu_end) / 2

        # Quality score - placeholder heuristic
        quality_score = self._calculate_quality_score(response, task.get("expected_output", ""))

        result = TestResult(
            model_name=model.model_name,
            task_name=task["name"],
            response_time=response_time,
            tps=tps,
            cpu_usage=cpu_avg,
            memory_usage=memory_avg,
            gpu_usage=gpu_avg,
            quality_score=quality_score,
            raw_output=response,
            expected_output=task.get("expected_output", ""),
            success=success,
            timestamp=time.time()
        )
        self.results.append(result)
        return result

    def run_all_tests(self, tasks: List[Dict[str, Any]]):
        """Run all tests across all models"""
        for model in self.models:
            print(f"Running tests on {model.model_name}")
            for task in tasks:
                result = self.run_single_test(model, task)
                print(f"  {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")

    def _get_gpu_usage(self) -> float:
        """Get GPU usage if available"""
        # Placeholder - a real implementation would query the GPU driver
        # (e.g. nvidia-smi, rocm-smi, or sysfs)
        return 0.0
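    # Hedged sketch of a concrete GPU probe for this repo's AMD target: amdgpu
    # typically exposes a gpu_busy_percent file under /sys/class/drm/card*/device/.
    # The card index and the presence of that sysfs file are assumptions, so this
    # stays a separate helper rather than replacing _get_gpu_usage.
    def _get_gpu_usage_amdgpu(self) -> float:
        """Read GPU busy percent from amdgpu sysfs, returning 0.0 if unavailable."""
        for busy_file in Path("/sys/class/drm").glob("card*/device/gpu_busy_percent"):
            try:
                return float(busy_file.read_text().strip())
            except (OSError, ValueError):
                continue
        return 0.0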
    def _calculate_quality_score(self, response: str, expected: str) -> float:
        """Calculate quality score based on response"""
        # This is a placeholder implementation.
        # In practice, this could use static analysis, code execution, etc.
        if not response:
            return 0.0

        # Very basic quality scoring
        score = 50  # Base score

        # Add points for presence of expected patterns
        if expected and expected.lower() in response.lower():
            score += 20

        # Add points for valid syntax (simplified)
        if not response.startswith("Error:") and len(response) > 10:
            score += 10

        # Cap score at 100
        return min(score, 100.0)
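    # Hedged sketch of the "static analysis" option mentioned above: score the
    # response by whether it parses as valid Python. Illustrative only and not
    # called by run_single_test; treating responses as bare Python source (rather
    # than prose with fenced code blocks) is an assumption about the tasks.
    def _syntax_quality_score(self, response: str) -> float:
        """Return 100.0 if the response parses as Python, else 0.0."""
        import ast  # local import keeps this sketch self-contained

        try:
            ast.parse(response)
            return 100.0
        except (SyntaxError, ValueError):
            return 0.0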
    def save_results(self, filename: Optional[str] = None):
        """Save test results to a JSON file"""
        if filename is None:
            filename = f"results_{int(time.time())}.json"

        # TestResult is a dataclass, so asdict() yields the same field/value
        # mapping as building each dict by hand
        results_data = [asdict(result) for result in self.results]

        output_path = self.output_dir / filename
        with open(output_path, 'w') as f:
            json.dump(results_data, f, indent=2)
        print(f"Results saved to {output_path}")
# Example model implementations
class MockLLM(LLMInterface):
    """Mock LLM for testing purposes"""

    def generate(self, prompt: str, **kwargs) -> str:
        # Simulate generation time
        time.sleep(0.5)
        return f"Mock response for: {prompt}"

    def get_model_info(self) -> Dict[str, Any]:
        return {
            "name": self.model_name,
            "type": "mock",
            "version": "1.0"
        }
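# Hedged sketch of a concrete backend for the local stack described in the commit
# message: it assumes a llama.cpp / vLLM style server exposing an OpenAI-compatible
# /v1/completions endpoint. The default URL, payload fields, and config keys are
# illustrative assumptions, not something this repo is known to pin down.
class OpenAICompatibleLLM(LLMInterface):
    """Minimal client for an OpenAI-compatible /v1/completions endpoint"""

    def generate(self, prompt: str, **kwargs) -> str:
        import urllib.request  # local import keeps the sketch self-contained

        base_url = self.config.get("base_url", "http://localhost:8080")
        payload = json.dumps({
            "model": self.model_name,
            "prompt": prompt,
            "max_tokens": kwargs.get("max_tokens", 256),
            "temperature": kwargs.get("temperature", 0.2),
        }).encode("utf-8")
        request = urllib.request.Request(
            f"{base_url}/v1/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(request, timeout=120) as response:
            body = json.load(response)
        return body["choices"][0]["text"]

    def get_model_info(self) -> Dict[str, Any]:
        return {
            "name": self.model_name,
            "type": "openai-compatible",
            "base_url": self.config.get("base_url", "http://localhost:8080"),
        }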
class ExampleTask:
    """Example task class for testing"""

    @staticmethod
    def get_sample_tasks() -> List[Dict[str, Any]]:
        return [
            {
                "name": "Simple function",
                "prompt": "Write a Python function to add two numbers",
                "expected_output": "return a + b",
                "parameters": {}
            },
            {
                "name": "Loop example",
                "prompt": "Write a Python loop that prints numbers 1 to 10",
                "expected_output": "for i in range(1, 11)",
                "parameters": {}
            }
        ]


if __name__ == "__main__":
    # Create test suite
    suite = TestSuite()

    # Add mock models
    suite.add_model(MockLLM("mock1", {}))
    suite.add_model(MockLLM("mock2", {}))

    # Run sample tests
    tasks = ExampleTask.get_sample_tasks()
    suite.run_all_tests(tasks)

    # Save results
    suite.save_results()
    print("Testing completed!")