#!/usr/bin/env python3
"""
LLM Performance Testing Framework (Simplified Version)
"""

import time
import json
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
from pathlib import Path


@dataclass
class TestResult:
    """Container for test results"""
    model_name: str
    task_name: str
    response_time: float
    tps: float
    cpu_usage: float
    memory_usage: float
    quality_score: float
    raw_output: str
    expected_output: str
    success: bool
    timestamp: float


@dataclass
class PerformanceMetrics:
    """Container for performance metrics"""
    response_time: float
    tps: float
    cpu_avg: float
    memory_avg: float


class LLMInterface(ABC):
    """Abstract base class for LLM interfaces"""

    def __init__(self, model_name: str, config: Dict[str, Any]):
        self.model_name = model_name
        self.config = config

    @abstractmethod
    def generate(self, prompt: str, **kwargs) -> str:
        """Generate text from prompt"""
        pass

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Get model information"""
        pass


class TestSuite:
    """Main test suite class"""

    def __init__(self, output_dir: str = "test_results"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)
        self.results: List[TestResult] = []
        self.models: List[LLMInterface] = []

    def add_model(self, model: LLMInterface):
        """Add a model to test"""
        self.models.append(model)

    def run_single_test(self, model: LLMInterface, task: Dict[str, Any]) -> TestResult:
        """Run a single test on a model"""
        # Start timing
        start_time = time.time()

        # Simulated resource usage at test start
        cpu_start = 50        # percent (simulated)
        memory_start = 1024   # MB (simulated)

        # Generate response
        try:
            response = model.generate(task["prompt"], **task.get("parameters", {}))
            success = True
        except Exception as e:
            response = f"Error: {str(e)}"
            success = False

        # Measure completion
        end_time = time.time()
        response_time = end_time - start_time

        # Tokens per second (rough estimate: whitespace-split word count)
        tps = 0.0
        if success and response:
            tps = len(response.split()) / response_time if response_time > 0 else 0.0

        # Simulated resource usage at test end
        cpu_end = 60          # percent (simulated)
        memory_end = 1536     # MB (simulated)

        # Average the simulated start/end readings
        cpu_avg = (cpu_start + cpu_end) / 2
        memory_avg = (memory_start + memory_end) / 2

        # Quality score - this is a placeholder
        quality_score = self._calculate_quality_score(response, task.get("expected_output", ""))

        result = TestResult(
            model_name=model.model_name,
            task_name=task["name"],
            response_time=response_time,
            tps=tps,
            cpu_usage=cpu_avg,
            memory_usage=memory_avg,
            quality_score=quality_score,
            raw_output=response,
            expected_output=task.get("expected_output", ""),
            success=success,
            timestamp=time.time()
        )
        self.results.append(result)
        return result

    def run_all_tests(self, tasks: List[Dict[str, Any]]):
        """Run all tests across all models"""
        for model in self.models:
            print(f"Running tests on {model.model_name}")
            for task in tasks:
                result = self.run_single_test(model, task)
                print(f"  {task['name']}: {result.response_time:.2f}s, {result.quality_score:.2f}/100")

    def _calculate_quality_score(self, response: str, expected: str) -> float:
        """Calculate quality score based on response"""
        # This is a placeholder implementation
        # In practice, this could use static analysis, code execution, etc.
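        # The heuristic below is intentionally crude: a flat base score, a
        # substring check against the expected output, and a small bonus for
        # non-error responses of non-trivial length. A stronger scorer could,
        # for example, run ast.parse() on code responses to verify syntax or
        # execute generated functions against unit tests.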
        if not response:
            return 0.0

        # Very basic quality scoring
        score = 50.0  # Base score

        # Add points for presence of expected patterns
        if expected and expected.lower() in response.lower():
            score += 20

        # Add points for valid syntax (simplified)
        if not response.startswith("Error:") and len(response) > 10:
            score += 10

        # Cap score at 100
        return min(score, 100.0)

    def save_results(self, filename: Optional[str] = None):
        """Save test results to JSON file"""
        if filename is None:
            filename = f"results_{int(time.time())}.json"

        results_data = []
        for result in self.results:
            results_data.append({
                "model_name": result.model_name,
                "task_name": result.task_name,
                "response_time": result.response_time,
                "tps": result.tps,
                "cpu_usage": result.cpu_usage,
                "memory_usage": result.memory_usage,
                "quality_score": result.quality_score,
                "raw_output": result.raw_output,
                "expected_output": result.expected_output,
                "success": result.success,
                "timestamp": result.timestamp
            })

        with open(self.output_dir / filename, 'w') as f:
            json.dump(results_data, f, indent=2)

        print(f"Results saved to {self.output_dir / filename}")


# Example model implementations
class MockLLM(LLMInterface):
    """Mock LLM for testing purposes"""

    def generate(self, prompt: str, **kwargs) -> str:
        # Simulate generation time
        time.sleep(0.1)
        return f"Mock response for: {prompt}"

    def get_model_info(self) -> Dict[str, Any]:
        return {
            "name": self.model_name,
            "type": "mock",
            "version": "1.0"
        }


class ExampleTask:
    """Example task class for testing"""

    @staticmethod
    def get_sample_tasks() -> List[Dict[str, Any]]:
        return [
            {
                "name": "Simple function",
                "prompt": "Write a Python function to add two numbers",
                "expected_output": "return a + b",
                "parameters": {}
            },
            {
                "name": "Loop example",
                "prompt": "Write a Python loop that prints numbers 1 to 10",
                "expected_output": "for i in range(1, 11)",
                "parameters": {}
            }
        ]


if __name__ == "__main__":
    # Create test suite
    suite = TestSuite()

    # Add mock models
    suite.add_model(MockLLM("mock1", {}))
    suite.add_model(MockLLM("mock2", {}))

    # Run sample tests
    tasks = ExampleTask.get_sample_tasks()
    suite.run_all_tests(tasks)

    # Save results
    suite.save_results()

    print("Testing completed!")
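

# ----------------------------------------------------------------------
# Sketch: wiring a real backend into the framework. This class is an
# illustrative assumption, not part of the framework above: it targets an
# OpenAI-style completions endpoint (e.g. a local llama.cpp or vLLM server).
# The URL, payload fields, and response shape are assumptions; adapt them to
# whatever server you run. Requires the third-party `requests` package.
class HTTPCompletionLLM(LLMInterface):
    """HTTP-backed model sketch; endpoint and payload format are assumptions."""

    def generate(self, prompt: str, **kwargs) -> str:
        # Imported here so the rest of the file runs without `requests` installed.
        import requests

        url = self.config.get("url", "http://localhost:8000/v1/completions")
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "max_tokens": kwargs.get("max_tokens", 256),
            "temperature": kwargs.get("temperature", 0.2),
        }
        resp = requests.post(url, json=payload, timeout=120)
        resp.raise_for_status()
        # Assumes an OpenAI-style response body: {"choices": [{"text": "..."}], ...}
        return resp.json()["choices"][0]["text"]

    def get_model_info(self) -> Dict[str, Any]:
        return {"name": self.model_name, "type": "http", "config": self.config}

# Usage sketch (assumes a server is listening on the configured URL):
#   suite.add_model(HTTPCompletionLLM("my-local-model",
#                                     {"url": "http://localhost:8000/v1/completions"}))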