228 lines
7.0 KiB
Python
228 lines
7.0 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""
|
||
|
|
LLM Performance Testing Framework (Simplified Version)
|
||
|
|
"""
|
||
|
|
|
||
|
|
import json
import os
import time
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
|
||
|
|
|
||
|
|
@dataclass
class TestResult:
    """Container for test results"""
    model_name: str  # identifier of the model that produced this result
    task_name: str  # "name" field of the task that was run
    response_time: float  # wall-clock seconds from request to response
    tps: float  # tokens-per-second estimate (whitespace word count / time)
    cpu_usage: float  # average CPU usage over the run (percent; simulated)
    memory_usage: float  # average memory usage over the run (MB; simulated)
    quality_score: float  # heuristic quality score on a 0-100 scale
    raw_output: str  # full model output, or "Error: ..." text on failure
    expected_output: str  # expected pattern from the task (may be empty)
    success: bool  # False when model.generate() raised an exception
    timestamp: float  # epoch seconds when the result was recorded
|
||
|
|
|
||
|
|
@dataclass
class PerformanceMetrics:
    """Container for performance metrics"""
    # NOTE(review): not constructed anywhere in this file — presumably used
    # by callers elsewhere; confirm before removing.
    response_time: float  # wall-clock seconds for one generation
    tps: float  # tokens-per-second estimate
    cpu_avg: float  # average CPU usage (percent)
    memory_avg: float  # average memory usage (MB)
|
||
|
|
|
||
|
|
class LLMInterface(ABC):
    """Contract that every model backend must implement."""

    def __init__(self, model_name: str, config: Dict[str, Any]):
        """Remember the model identifier and its backend configuration."""
        self.model_name = model_name
        self.config = config

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Produce completion text for *prompt*."""
        ...

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Return descriptive metadata about the underlying model."""
        ...
|
||
|
|
|
||
|
|
class TestSuite:
    """Main test suite class.

    Holds the models to benchmark, runs tasks against each of them,
    accumulates TestResult records, and persists them to JSON.
    """

    def __init__(self, output_dir: str = "test_results"):
        """Create the suite and make sure *output_dir* exists.

        Args:
            output_dir: directory where result JSON files are written.
        """
        self.output_dir = Path(output_dir)
        # parents=True so nested paths like "out/run-1" also work.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.results: List[TestResult] = []
        self.models: List[LLMInterface] = []

    def add_model(self, model: LLMInterface):
        """Add a model to test"""
        self.models.append(model)

    def run_single_test(self, model: LLMInterface, task: Dict[str, Any]) -> TestResult:
        """Run a single test on a model.

        Args:
            model: backend to query.
            task: dict with "name" and "prompt" keys, plus optional
                "parameters" (kwargs for generate) and "expected_output".

        Returns:
            The recorded TestResult (also appended to ``self.results``).
        """
        start_time = time.time()

        # Resource figures are simulated — no real sampling happens here.
        cpu_start = 50  # percent
        memory_start = 1024  # MB

        try:
            response = model.generate(task["prompt"], **task.get("parameters", {}))
            success = True
        except Exception as e:
            # Record the failure as a result instead of aborting the run.
            response = f"Error: {str(e)}"
            success = False

        end_time = time.time()
        response_time = end_time - start_time

        # Tokens-per-second estimate; whitespace words stand in for tokens.
        tps = 0.0
        if success and response:
            tps = len(response.split()) / response_time if response_time > 0 else 0.0

        # Simulated end-of-run resource readings.
        cpu_end = 60  # percent
        memory_end = 1536  # MB

        cpu_avg = (cpu_start + cpu_end) / 2
        memory_avg = (memory_start + memory_end) / 2

        # Placeholder heuristic; see _calculate_quality_score.
        quality_score = self._calculate_quality_score(response, task.get("expected_output", ""))

        result = TestResult(
            model_name=model.model_name,
            task_name=task["name"],
            response_time=response_time,
            tps=tps,
            cpu_usage=cpu_avg,
            memory_usage=memory_avg,
            quality_score=quality_score,
            raw_output=response,
            expected_output=task.get("expected_output", ""),
            success=success,
            timestamp=time.time()
        )

        self.results.append(result)
        return result

    def run_all_tests(self, tasks: List[Dict[str, Any]]):
        """Run all tests across all models"""
        for model in self.models:
            print(f"Running tests on {model.model_name}")
            for task in tasks:
                result = self.run_single_test(model, task)
                print(f" {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")

    def _calculate_quality_score(self, response: str, expected: str) -> float:
        """Score *response* heuristically on a 0-100 scale.

        Placeholder implementation; in practice this could use static
        analysis, code execution, etc.
        """
        if not response:
            return 0.0

        score = 50.0  # base score for any non-empty response

        # Bonus when the expected pattern appears (case-insensitive).
        if expected and expected.lower() in response.lower():
            score += 20

        # Bonus for a non-error, non-trivial response.
        if not response.startswith("Error:") and len(response) > 10:
            score += 10

        return min(score, 100.0)

    def save_results(self, filename: Optional[str] = None):
        """Save test results to a JSON file in ``self.output_dir``.

        Args:
            filename: target file name; defaults to a timestamped name.
        """
        if filename is None:
            filename = f"results_{int(time.time())}.json"

        # TestResult is a dataclass, so asdict() yields exactly the
        # field -> value mapping the previous hand-written dict built.
        results_data = [asdict(result) for result in self.results]

        out_path = self.output_dir / filename
        with open(out_path, 'w') as f:
            json.dump(results_data, f, indent=2)

        print(f"Results saved to {out_path}")
|
||
|
|
|
||
|
|
# Example model implementations
class MockLLM(LLMInterface):
    """Stub backend that fabricates deterministic replies for testing."""

    def generate(self, prompt: str, **kwargs) -> str:
        """Return a canned reply after a short artificial delay."""
        time.sleep(0.1)  # mimic real model latency
        reply = f"Mock response for: {prompt}"
        return reply

    def get_model_info(self) -> Dict[str, Any]:
        """Describe this mock backend."""
        info = {
            "name": self.model_name,
            "type": "mock",
            "version": "1.0"
        }
        return info
|
||
|
|
|
||
|
|
class ExampleTask:
    """Example task class for testing"""

    @staticmethod
    def get_sample_tasks() -> List[Dict[str, Any]]:
        """Return two small, self-contained coding tasks."""
        specs = [
            ("Simple function",
             "Write a Python function to add two numbers",
             "return a + b"),
            ("Loop example",
             "Write a Python loop that prints numbers 1 to 10",
             "for i in range(1, 11)"),
        ]
        return [
            {
                "name": name,
                "prompt": prompt,
                "expected_output": expected,
                "parameters": {},
            }
            for name, prompt, expected in specs
        ]
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Build a suite with two mock backends, run the sample tasks,
    # and persist the results.
    suite = TestSuite()
    for model_name in ("mock1", "mock2"):
        suite.add_model(MockLLM(model_name, {}))

    suite.run_all_tests(ExampleTask.get_sample_tasks())
    suite.save_results()

    print("Testing completed!")
|