# Source: localgenai/testing/qwen3-coder-30b/llm_test_framework_simple.py

#!/usr/bin/env python3
"""
LLM Performance Testing Framework (Simplified Version)
"""
import json
import os
import time

from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
@dataclass
class TestResult:
    """Record of a single model/task test run, as produced by TestSuite.run_single_test()."""
    model_name: str       # name of the model that produced this result
    task_name: str        # task identifier (the task dict's "name" key)
    response_time: float  # wall-clock seconds for the generate() call
    tps: float            # estimated tokens/second (word count / response_time)
    cpu_usage: float      # simulated average CPU percentage (not a real measurement)
    memory_usage: float   # simulated average memory in MB (not a real measurement)
    quality_score: float  # heuristic 0-100 score from TestSuite._calculate_quality_score
    raw_output: str       # text returned by the model, or "Error: ..." on failure
    expected_output: str  # expected snippet copied from the task definition
    success: bool         # False when generate() raised an exception
    timestamp: float      # epoch seconds when the result was recorded
@dataclass
class PerformanceMetrics:
    """Aggregate performance metrics for a run.

    NOTE(review): not referenced anywhere in this file — possibly used by
    callers elsewhere; confirm before removing.
    """
    response_time: float  # wall-clock seconds for a generation call
    tps: float            # estimated tokens per second
    cpu_avg: float        # average CPU usage percentage
    memory_avg: float     # average memory usage in MB
class LLMInterface(ABC):
    """Abstract base class for LLM backends tested by TestSuite.

    Concrete subclasses must implement generate() and get_model_info().
    """
    def __init__(self, model_name: str, config: Dict[str, Any]):
        # model_name is read by TestSuite for reporting; config is
        # backend-specific and not interpreted by this base class.
        self.model_name = model_name
        self.config = config
    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Return generated text for *prompt*; kwargs are backend-specific options."""
        pass
    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Return descriptive metadata about the model (name, type, version, ...)."""
        pass
class TestSuite:
    """Runs a list of tasks against registered models and records TestResults.

    Resource figures (CPU / memory) are simulated placeholders, not real
    measurements.
    """

    def __init__(self, output_dir: str = "test_results"):
        """Create the suite, ensuring *output_dir* exists for saved results."""
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.results: List[TestResult] = []
        self.models: List[LLMInterface] = []

    def add_model(self, model: LLMInterface) -> None:
        """Register a model to be exercised by run_all_tests()."""
        self.models.append(model)

    def run_single_test(self, model: LLMInterface, task: Dict[str, Any]) -> TestResult:
        """Run one *task* on *model*, append a TestResult, and return it.

        A generate() exception is recorded as a failed result ("Error: ..."
        in raw_output) rather than propagated.
        """
        start_time = time.time()
        # Simulated resource usage — no real sampling is performed.
        cpu_start = 50       # percent
        memory_start = 1024  # MB
        try:
            response = model.generate(task["prompt"], **task.get("parameters", {}))
            success = True
        except Exception as e:
            # Capture the failure as the "response" so it is still recorded.
            response = f"Error: {str(e)}"
            success = False
        response_time = time.time() - start_time
        # Rough tokens/second estimate using whitespace-split word count;
        # guard against a zero elapsed time.
        tps = 0.0
        if success and response and response_time > 0:
            tps = len(response.split()) / response_time
        cpu_end = 60       # percent (simulated)
        memory_end = 1536  # MB (simulated)
        quality_score = self._calculate_quality_score(
            response, task.get("expected_output", "")
        )
        result = TestResult(
            model_name=model.model_name,
            task_name=task["name"],
            response_time=response_time,
            tps=tps,
            cpu_usage=(cpu_start + cpu_end) / 2,
            memory_usage=(memory_start + memory_end) / 2,
            quality_score=quality_score,
            raw_output=response,
            expected_output=task.get("expected_output", ""),
            success=success,
            timestamp=time.time(),
        )
        self.results.append(result)
        return result

    def run_all_tests(self, tasks: List[Dict[str, Any]]) -> None:
        """Run every task on every registered model, printing a summary line each."""
        for model in self.models:
            print(f"Running tests on {model.model_name}")
            for task in tasks:
                result = self.run_single_test(model, task)
                print(f" {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")

    def _calculate_quality_score(self, response: str, expected: str) -> float:
        """Heuristic 0-100 quality score for *response*.

        Placeholder scoring: 0 for empty output, a base of 50 otherwise,
        +20 when *expected* appears (case-insensitively) in the response,
        +10 for a non-error response longer than 10 characters.
        """
        if not response:
            return 0.0
        score = 50  # base score for any non-empty response
        if expected and expected.lower() in response.lower():
            score += 20
        if not response.startswith("Error:") and len(response) > 10:
            score += 10
        return min(score, 100.0)

    def save_results(self, filename: Optional[str] = None) -> None:
        """Serialize all recorded results to a JSON file under output_dir.

        When *filename* is None a timestamped name is generated.
        """
        if filename is None:
            filename = f"results_{int(time.time())}.json"
        # TestResult is a dataclass, so asdict() replaces the previous
        # hand-written 11-field dict copy (which could silently drift).
        results_data = [asdict(result) for result in self.results]
        out_path = self.output_dir / filename
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results_data, f, indent=2)
        print(f"Results saved to {out_path}")
# Example model implementations
class MockLLM(LLMInterface):
    """Deterministic stand-in model used to exercise the test suite."""

    def generate(self, prompt: str, **kwargs) -> str:
        """Echo the prompt back after a short artificial delay."""
        time.sleep(0.1)  # mimic generation latency
        return f"Mock response for: {prompt}"

    def get_model_info(self) -> Dict[str, Any]:
        """Describe this mock backend."""
        info = {
            "name": self.model_name,
            "type": "mock",
            "version": "1.0",
        }
        return info
class ExampleTask:
    """Provides canned code-generation tasks for exercising the test suite."""

    @staticmethod
    def get_sample_tasks() -> List[Dict[str, Any]]:
        """Return two small sample tasks with their expected output snippets."""
        samples = (
            ("Simple function",
             "Write a Python function to add two numbers",
             "return a + b"),
            ("Loop example",
             "Write a Python loop that prints numbers 1 to 10",
             "for i in range(1, 11)"),
        )
        return [
            {
                "name": name,
                "prompt": prompt,
                "expected_output": expected,
                "parameters": {},
            }
            for name, prompt, expected in samples
        ]
if __name__ == "__main__":
    # Demo run: two mock models against the bundled sample tasks.
    suite = TestSuite()
    for mock_name in ("mock1", "mock2"):
        suite.add_model(MockLLM(mock_name, {}))
    suite.run_all_tests(ExampleTask.get_sample_tasks())
    suite.save_results()
    print("Testing completed!")