Initial commit: localgenai stack
Containerized local LLM stack for the Framework Desktop / Strix Halo,
plus the OpenCode harness on the Mac side.
- pyinfra/framework/: pyinfra deploy targeting the box
- llama.cpp (Vulkan), vLLM (ROCm), Ollama (ROCm with HSA override
for gfx1151), OpenWebUI
- Beszel (host + container + AMD GPU dashboard via sysfs)
- OpenLIT (LLM fleet metrics)
- Phoenix (per-trace agent waterfall)
- OpenHands (autonomous agent in a Docker sandbox)
- opencode/: OpenCode config + Phoenix bridge plugin (OTel exporter)
- install.sh deploys to ~/.config/opencode/
- StrixHaloSetup.md / StrixHaloMemory.md / Roadmap.md / TODO.md:
documentation and planning
- testing/qwen3-coder-30b/: small evaluation harness
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
228
testing/qwen3-coder-30b/llm_test_framework_simple.py
Executable file
228
testing/qwen3-coder-30b/llm_test_framework_simple.py
Executable file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
LLM Performance Testing Framework (Simplified Version)
|
||||
"""
|
||||
|
||||
import json
import os
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
@dataclass
class TestResult:
    """Outcome of running one task against one model.

    The field order defines the positional constructor signature, so it
    must stay as declared here.
    """

    # --- identification ---
    model_name: str
    task_name: str

    # --- performance figures ---
    response_time: float
    tps: float
    cpu_usage: float
    memory_usage: float

    # --- output and scoring ---
    quality_score: float
    raw_output: str
    expected_output: str
    success: bool

    # --- bookkeeping ---
    timestamp: float
|
||||
|
||||
@dataclass
class PerformanceMetrics:
    """Bundle of the four performance figures for a run.

    Field order defines the positional constructor signature.
    """

    # Timing / throughput
    response_time: float
    tps: float

    # Averaged resource figures
    cpu_avg: float
    memory_avg: float
|
||||
|
||||
class LLMInterface(ABC):
    """Contract that every model backend has to implement.

    The base class only stores the model name and its configuration;
    subclasses supply :meth:`generate` and :meth:`get_model_info`.
    """

    def __init__(self, model_name: str, config: Dict[str, Any]):
        # Store both constructor arguments unchanged on the instance.
        self.model_name, self.config = model_name, config

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Produce the model's text output for ``prompt``.

        Subclasses may accept backend-specific options through ``**kwargs``.
        """
        ...

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Describe the model as a dictionary."""
        ...
|
||||
|
||||
class TestSuite:
    """Run prompt tasks against registered models and collect the results.

    CPU and memory figures are simulated constants, not measurements.
    Results accumulate in ``self.results`` in execution order and can be
    written to a JSON file with :meth:`save_results`.
    """

    def __init__(self, output_dir: str = "test_results"):
        """Create an empty suite and make sure ``output_dir`` exists.

        Args:
            output_dir: Directory that receives result files. It is created,
                together with any missing parent directories, if absent.
        """
        self.output_dir = Path(output_dir)
        # parents=True so a nested path such as "runs/a/b" works on first use.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.results: List["TestResult"] = []
        self.models: List["LLMInterface"] = []

    def add_model(self, model: "LLMInterface"):
        """Register ``model`` so that :meth:`run_all_tests` exercises it."""
        self.models.append(model)

    @staticmethod
    def _estimate_tps(response: str, elapsed: float) -> float:
        """Return a rough tokens-per-second figure.

        Whitespace-separated words stand in for tokens. An empty response or
        a non-positive ``elapsed`` yields ``0.0``.
        """
        if not response or elapsed <= 0:
            return 0.0
        return len(response.split()) / elapsed

    def run_single_test(
        self, model: "LLMInterface", task: Dict[str, Any]
    ) -> "TestResult":
        """Run one task against one model and record the outcome.

        Args:
            model: Backend whose ``generate`` method is called.
            task: Mapping with the keys ``"prompt"`` and ``"name"`` and the
                optional keys ``"parameters"`` (keyword arguments forwarded
                to ``generate``) and ``"expected_output"``.

        Returns:
            The ``TestResult`` that was also appended to ``self.results``.

        Raises:
            KeyError: If ``task`` has no ``"name"`` entry. A missing
                ``"prompt"`` is caught like any generation error and is
                recorded as a failed result instead.
        """
        # perf_counter is monotonic, so the measured duration cannot become
        # negative or jump if the wall clock is adjusted during the run.
        started = time.perf_counter()

        # Simulated resource usage at the start of the run.
        cpu_start = 50  # percent
        memory_start = 1024  # MB

        try:
            response = model.generate(task["prompt"], **task.get("parameters", {}))
            success = True
        except Exception as e:
            # Deliberate best-effort: a failing model is recorded, not fatal.
            response = f"Error: {str(e)}"
            success = False

        response_time = time.perf_counter() - started

        # Failed runs report no throughput.
        tps = self._estimate_tps(response, response_time) if success else 0.0

        # Simulated resource usage at the end of the run.
        cpu_end = 60  # percent
        memory_end = 1536  # MB

        cpu_avg = (cpu_start + cpu_end) / 2
        memory_avg = (memory_start + memory_end) / 2

        expected = task.get("expected_output", "")
        quality_score = self._calculate_quality_score(response, expected)

        result = TestResult(
            model_name=model.model_name,
            task_name=task["name"],
            response_time=response_time,
            tps=tps,
            cpu_usage=cpu_avg,
            memory_usage=memory_avg,
            quality_score=quality_score,
            raw_output=response,
            expected_output=expected,
            success=success,
            # Wall-clock time is kept here: this records when the run happened.
            timestamp=time.time(),
        )

        self.results.append(result)
        return result

    def run_all_tests(self, tasks: List[Dict[str, Any]]):
        """Run every task against every registered model, printing progress."""
        for model in self.models:
            print(f"Running tests on {model.model_name}")
            for task in tasks:
                result = self.run_single_test(model, task)
                print(f" {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")

    def _calculate_quality_score(self, response: str, expected: str) -> float:
        """Return a placeholder quality score between 0 and 100.

        Scoring rules:
            * empty response: 0
            * otherwise a base of 50
            * +20 if ``expected`` is non-empty and occurs in ``response``
              (case-insensitive)
            * +10 if ``response`` does not start with ``"Error:"`` and is
              longer than 10 characters

        The result is always a ``float``, as the annotation promises.
        """
        if not response:
            return 0.0

        score = 50.0

        if expected and expected.lower() in response.lower():
            score += 20.0

        if not response.startswith("Error:") and len(response) > 10:
            score += 10.0

        return min(score, 100.0)

    def save_results(self, filename: Optional[str] = None) -> Path:
        """Write all collected results to a JSON file in ``output_dir``.

        Args:
            filename: Name of the file inside ``output_dir``. Defaults to
                ``results_<unix-seconds>.json``.

        Returns:
            Path of the file that was written.
        """
        if filename is None:
            filename = f"results_{int(time.time())}.json"

        target = self.output_dir / filename
        # asdict emits the dataclass fields in declaration order, so the JSON
        # always mirrors the result type without a hand-maintained key list.
        payload = [asdict(result) for result in self.results]

        with open(target, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

        print(f"Results saved to {target}")
        return target
|
||||
|
||||
# Example model implementations
|
||||
class MockLLM(LLMInterface):
    """Stand-in backend that returns a canned reply without a real model."""

    def generate(self, prompt: str, **kwargs) -> str:
        """Pause for 0.1 s, then echo ``prompt`` inside a fixed reply.

        Keyword arguments are accepted and ignored.
        """
        delay_seconds = 0.1
        time.sleep(delay_seconds)
        reply = f"Mock response for: {prompt}"
        return reply

    def get_model_info(self) -> Dict[str, Any]:
        """Return the mock's name, type and version as a dictionary."""
        info: Dict[str, Any] = {}
        info["name"] = self.model_name
        info["type"] = "mock"
        info["version"] = "1.0"
        return info
|
||||
|
||||
class ExampleTask:
    """Supplier of the built-in demonstration tasks."""

    @staticmethod
    def get_sample_tasks() -> List[Dict[str, Any]]:
        """Return a newly built list of the two sample coding tasks.

        Every task dictionary has the keys ``name``, ``prompt``,
        ``expected_output`` and ``parameters`` (an empty dictionary).
        """
        # (name, prompt, expected_output) for each sample task.
        catalogue = (
            (
                "Simple function",
                "Write a Python function to add two numbers",
                "return a + b",
            ),
            (
                "Loop example",
                "Write a Python loop that prints numbers 1 to 10",
                "for i in range(1, 11)",
            ),
        )
        return [
            {
                "name": title,
                "prompt": prompt_text,
                "expected_output": expected,
                "parameters": {},
            }
            for title, prompt_text, expected in catalogue
        ]
|
||||
|
||||
def main() -> None:
    """Demo run: two mock models, the sample tasks, results saved to JSON."""
    suite = TestSuite()

    # Register the two mock backends, each with its own empty config.
    for mock_name in ("mock1", "mock2"):
        suite.add_model(MockLLM(mock_name, {}))

    # Run the built-in sample tasks against every registered model.
    suite.run_all_tests(ExampleTask.get_sample_tasks())

    # Write the collected results to the suite's output directory.
    suite.save_results()

    print("Testing completed!")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user