393 lines
15 KiB
Python
393 lines
15 KiB
Python
|
|
"""
|
|||
|
|
Flow Analysis Report Generator
|
|||
|
|
Generates comprehensive flow analysis reports with markup formatting
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import datetime
|
|||
|
|
from typing import Dict, List, Optional
|
|||
|
|
from pathlib import Path
|
|||
|
|
from ..models import FlowStats, FrameTypeStats
|
|||
|
|
|
|||
|
|
|
|||
|
|
class FlowReportGenerator:
|
|||
|
|
"""Generate comprehensive flow analysis reports"""
|
|||
|
|
|
|||
|
|
    def __init__(self, analyzer):
        """Initialize the report generator.

        Args:
            analyzer: Flow analyzer instance exposing a ``flows`` mapping
                whose values are FlowStats objects (see ``..models``).
        """
        self.analyzer = analyzer
|
|||
|
|
|
|||
|
|
def generate_report(self, output_path: Optional[str] = None, format_type: str = "markdown") -> str:
|
|||
|
|
"""Generate comprehensive flow analysis report"""
|
|||
|
|
if format_type == "markdown":
|
|||
|
|
return self._generate_markdown_report(output_path)
|
|||
|
|
elif format_type == "html":
|
|||
|
|
return self._generate_html_report(output_path)
|
|||
|
|
else:
|
|||
|
|
return self._generate_text_report(output_path)
|
|||
|
|
|
|||
|
|
    def _generate_markdown_report(self, output_path: Optional[str] = None) -> str:
        """Generate markdown-formatted report.

        Args:
            output_path: Optional file path; when given, the rendered
                report is also written there as UTF-8.

        Returns:
            The full markdown report as one newline-joined string.
        """
        flows = list(self.analyzer.flows.values())

        # Sort flows by importance (enhanced first, then by packet count)
        flows.sort(key=lambda x: (
            x.enhanced_analysis.decoder_type != "Standard",
            len(x.outlier_frames),
            x.frame_count
        ), reverse=True)

        report_lines = []

        # Header
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        report_lines.extend([
            "# StreamLens Flow Analysis Report",
            f"**Generated:** {timestamp}",
            f"**Total Flows:** {len(flows)}",
            f"**Analysis Engine:** {self.analyzer.__class__.__name__}",
            "",
            "---",
            ""
        ])

        # Executive Summary
        report_lines.extend(self._generate_executive_summary(flows))

        # Detailed Flow Analysis
        report_lines.extend([
            "## 📊 Detailed Flow Analysis",
            ""
        ])

        # One numbered section per flow, in sorted (importance) order.
        for i, flow in enumerate(flows, 1):
            report_lines.extend(self._generate_flow_section(flow, i))

        # Statistics Summary
        report_lines.extend(self._generate_statistics_summary(flows))

        report_content = "\n".join(report_lines)

        # Save to file if path provided
        if output_path:
            output_file = Path(output_path)
            output_file.write_text(report_content, encoding='utf-8')

        return report_content
|
|||
|
|
|
|||
|
|
def _generate_executive_summary(self, flows: List[FlowStats]) -> List[str]:
|
|||
|
|
"""Generate executive summary section"""
|
|||
|
|
total_packets = sum(flow.frame_count for flow in flows)
|
|||
|
|
total_bytes = sum(flow.total_bytes for flow in flows)
|
|||
|
|
enhanced_flows = [f for f in flows if f.enhanced_analysis.decoder_type != "Standard"]
|
|||
|
|
high_outlier_flows = [f for f in flows if len(f.outlier_frames) > f.frame_count * 0.1]
|
|||
|
|
|
|||
|
|
return [
|
|||
|
|
"## 📋 Executive Summary",
|
|||
|
|
"",
|
|||
|
|
f"- **Total Network Flows:** {len(flows)}",
|
|||
|
|
f"- **Total Packets Analyzed:** {total_packets:,}",
|
|||
|
|
f"- **Total Data Volume:** {self._format_bytes(total_bytes)}",
|
|||
|
|
f"- **Enhanced Protocol Flows:** {len(enhanced_flows)} ({len(enhanced_flows)/len(flows)*100:.1f}%)",
|
|||
|
|
f"- **Flows with Timing Issues:** {len(high_outlier_flows)} ({len(high_outlier_flows)/len(flows)*100:.1f}%)",
|
|||
|
|
"",
|
|||
|
|
"### 🎯 Key Findings",
|
|||
|
|
""
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
    def _generate_flow_section(self, flow: FlowStats, flow_num: int) -> List[str]:
        """Build the full markdown section for a single flow.

        Args:
            flow: The flow to describe.
            flow_num: 1-based display index used in the section heading.

        Returns:
            Markdown lines: heading, attribute table, and (conditionally)
            enhanced-analysis, frame-type, and timing subsections.
        """
        lines = []

        # Flow Header
        status_emoji = self._get_flow_status_emoji(flow)
        quality_score = self._get_quality_score(flow)

        lines.extend([
            f"### {status_emoji} Flow #{flow_num}: {flow.src_ip}:{flow.src_port} → {flow.dst_ip}:{flow.dst_port}",
            ""
        ])

        # Basic Information Table
        lines.extend([
            "| Attribute | Value |",
            "|-----------|-------|",
            f"| **Protocol** | {flow.transport_protocol} |",
            f"| **Classification** | {flow.traffic_classification} |",
            f"| **Packets** | {flow.frame_count:,} |",
            f"| **Volume** | {self._format_bytes(flow.total_bytes)} |",
            f"| **Quality Score** | {quality_score}% |",
            f"| **Duration** | {flow.duration:.2f}s |",
            f"| **First Seen** | {self._format_timestamp(flow.first_seen)} |",
            f"| **Last Seen** | {self._format_timestamp(flow.last_seen)} |",
            ""
        ])

        # Enhanced Analysis (if available) — only non-"Standard" decoders
        # carry the extra protocol metrics.
        if flow.enhanced_analysis.decoder_type != "Standard":
            lines.extend(self._generate_enhanced_analysis_section(flow))

        # Frame Type Breakdown
        if flow.frame_types:
            lines.extend(self._generate_frame_types_section(flow))

        # Timing Analysis (always emitted; handles sparse data itself)
        lines.extend(self._generate_timing_analysis_section(flow))

        lines.append("")
        return lines
|
|||
|
|
|
|||
|
|
def _generate_enhanced_analysis_section(self, flow: FlowStats) -> List[str]:
|
|||
|
|
"""Generate enhanced analysis section"""
|
|||
|
|
ea = flow.enhanced_analysis
|
|||
|
|
|
|||
|
|
lines = [
|
|||
|
|
"#### 🔬 Enhanced Protocol Analysis",
|
|||
|
|
"",
|
|||
|
|
"| Metric | Value |",
|
|||
|
|
"|--------|-------|",
|
|||
|
|
f"| **Decoder Type** | {ea.decoder_type} |",
|
|||
|
|
f"| **Frame Quality** | {ea.avg_frame_quality:.1f}% |",
|
|||
|
|
f"| **Field Count** | {ea.field_count} |",
|
|||
|
|
f"| **Timing Accuracy** | {ea.timing_accuracy:.1f}% |",
|
|||
|
|
f"| **Signal Quality** | {ea.signal_quality:.1f}% |"
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
if ea.decoder_type.startswith("Chapter10"):
|
|||
|
|
lines.extend([
|
|||
|
|
f"| **Channel Count** | {ea.channel_count} |",
|
|||
|
|
f"| **Analog Channels** | {ea.analog_channels} |",
|
|||
|
|
f"| **PCM Channels** | {ea.pcm_channels} |",
|
|||
|
|
f"| **TMATS Frames** | {ea.tmats_frames} |",
|
|||
|
|
f"| **Clock Drift** | {ea.avg_clock_drift_ppm:.2f} ppm |",
|
|||
|
|
f"| **Timing Quality** | {ea.timing_quality} |"
|
|||
|
|
])
|
|||
|
|
|
|||
|
|
lines.extend(["", ""])
|
|||
|
|
return lines
|
|||
|
|
|
|||
|
|
    def _generate_frame_types_section(self, flow: FlowStats) -> List[str]:
        """Generate the per-frame-type breakdown table for *flow*.

        Args:
            flow: Flow whose ``frame_types`` mapping (name -> stats) is
                rendered, sorted by descending frame count.

        Returns:
            Markdown lines for the table, ending with two blank lines.
        """
        lines = [
            "#### 📦 Frame Type Analysis",
            "",
            "| Frame Type | Count | % | Avg ΔT | Std σ | Outliers | Outlier Frames |",
            "|------------|-------|---|---------|--------|----------|----------------|"
        ]

        # Sort frame types by count
        sorted_types = sorted(
            flow.frame_types.items(),
            key=lambda x: x[1].count,
            reverse=True
        )

        total_count = flow.frame_count
        for frame_type, stats in sorted_types:
            percentage = (stats.count / total_count * 100) if total_count > 0 else 0

            # Format timing values; empty string when no timing data
            # (avg/std of 0 means "not measured" here).
            delta_t = ""
            if stats.avg_inter_arrival > 0:
                dt_ms = stats.avg_inter_arrival * 1000
                # Switch to seconds once the value exceeds one second.
                delta_t = f"{dt_ms:.1f}ms" if dt_ms < 1000 else f"{dt_ms/1000:.1f}s"

            sigma = ""
            if stats.std_inter_arrival > 0:
                sig_ms = stats.std_inter_arrival * 1000
                sigma = f"{sig_ms:.1f}ms" if sig_ms < 1000 else f"{sig_ms/1000:.1f}s"

            outliers = len(stats.outlier_frames)
            outlier_str = f"⚠️ {outliers}" if outliers > 0 else f"{outliers}"

            # Format outlier frames (show first 5)
            # NOTE(review): this slices the first 5 entries in insertion
            # order and only then sorts them for display. If the intent is
            # the 5 *lowest-numbered* frames, sorting should precede the
            # slice — confirm intent before changing.
            outlier_frames = ""
            if stats.outlier_frames:
                frames = sorted(stats.outlier_frames[:5])
                outlier_frames = ", ".join(map(str, frames))
                if len(stats.outlier_frames) > 5:
                    outlier_frames += f", +{len(stats.outlier_frames) - 5}"

            lines.append(
                f"| `{frame_type}` | {stats.count:,} | {percentage:.1f}% | {delta_t} | {sigma} | {outlier_str} | {outlier_frames} |"
            )

        lines.extend(["", ""])
        return lines
|
|||
|
|
|
|||
|
|
    def _generate_timing_analysis_section(self, flow: FlowStats) -> List[str]:
        """Generate the timing analysis section for *flow*.

        Args:
            flow: Flow whose inter-arrival statistics are rendered.

        Returns:
            Markdown lines: metric table, optional outlier detail table,
            and a qualitative timing assessment. Emits a placeholder when
            fewer than two inter-arrival samples exist.
        """
        lines = [
            "#### ⏱️ Timing Analysis",
            ""
        ]

        # Need at least two samples to say anything about inter-arrival.
        if len(flow.inter_arrival_times) < 2:
            lines.extend([
                "*Insufficient timing data for analysis*",
                ""
            ])
            return lines

        # Overall timing metrics (stored in seconds; displayed in ms)
        avg_ms = flow.avg_inter_arrival * 1000
        std_ms = flow.std_inter_arrival * 1000
        jitter_ms = flow.jitter * 1000
        outlier_pct = len(flow.outlier_frames) / flow.frame_count * 100 if flow.frame_count > 0 else 0

        lines.extend([
            "| Timing Metric | Value |",
            "|---------------|-------|",
            f"| **Average Inter-arrival** | {avg_ms:.2f}ms |",
            f"| **Standard Deviation** | {std_ms:.2f}ms |",
            f"| **Jitter** | {jitter_ms:.2f}ms |",
            f"| **Outlier Percentage** | {outlier_pct:.1f}% |",
            f"| **Total Outliers** | {len(flow.outlier_frames)} |",
            ""
        ])

        # Outlier Frame Details
        if flow.outlier_frames:
            lines.extend([
                "##### 🚨 Outlier Frames",
                "",
                f"**Frame Numbers:** {', '.join(map(str, sorted(flow.outlier_frames)))}",
                ""
            ])

            if flow.outlier_details:
                lines.extend([
                    "| Frame # | Inter-arrival Time | Deviation |",
                    "|---------|-------------------|-----------|"
                ])

                # Show up to 20 outliers in detail.
                # outlier_details items unpack as (frame_num, inter_time)
                # pairs — presumably seconds; TODO confirm against model.
                # NOTE(review): slices the first 20 entries before sorting;
                # if the 20 lowest frame numbers are wanted, sort first.
                for frame_num, inter_time in sorted(flow.outlier_details[:20]):
                    # Deviation in standard deviations (σ) from the mean.
                    deviation = (inter_time - flow.avg_inter_arrival) / flow.std_inter_arrival if flow.std_inter_arrival > 0 else 0
                    lines.append(
                        f"| {frame_num} | {inter_time * 1000:.3f}ms | {deviation:.1f}σ |"
                    )

                if len(flow.outlier_details) > 20:
                    lines.append(f"| ... | +{len(flow.outlier_details) - 20} more | ... |")

                lines.append("")

        # Timing Quality Assessment — thresholds on outlier percentage.
        if outlier_pct < 1:
            timing_assessment = "🟢 **Excellent** - Very stable timing"
        elif outlier_pct < 5:
            timing_assessment = "🟡 **Good** - Minor timing variations"
        elif outlier_pct < 10:
            timing_assessment = "🟠 **Fair** - Noticeable timing issues"
        else:
            timing_assessment = "🔴 **Poor** - Significant timing problems"

        lines.extend([
            f"**Timing Quality:** {timing_assessment}",
            ""
        ])

        return lines
|
|||
|
|
|
|||
|
|
def _generate_statistics_summary(self, flows: List[FlowStats]) -> List[str]:
|
|||
|
|
"""Generate overall statistics summary"""
|
|||
|
|
if not flows:
|
|||
|
|
return []
|
|||
|
|
|
|||
|
|
# Calculate aggregate statistics
|
|||
|
|
total_packets = sum(flow.frame_count for flow in flows)
|
|||
|
|
total_bytes = sum(flow.total_bytes for flow in flows)
|
|||
|
|
total_outliers = sum(len(flow.outlier_frames) for flow in flows)
|
|||
|
|
|
|||
|
|
# Protocol distribution
|
|||
|
|
protocol_counts = {}
|
|||
|
|
for flow in flows:
|
|||
|
|
proto = flow.transport_protocol
|
|||
|
|
protocol_counts[proto] = protocol_counts.get(proto, 0) + 1
|
|||
|
|
|
|||
|
|
# Enhanced protocol distribution
|
|||
|
|
enhanced_types = {}
|
|||
|
|
for flow in flows:
|
|||
|
|
if flow.enhanced_analysis.decoder_type != "Standard":
|
|||
|
|
enhanced_types[flow.enhanced_analysis.decoder_type] = enhanced_types.get(flow.enhanced_analysis.decoder_type, 0) + 1
|
|||
|
|
|
|||
|
|
lines = [
|
|||
|
|
"---",
|
|||
|
|
"",
|
|||
|
|
"## 📈 Statistical Summary",
|
|||
|
|
"",
|
|||
|
|
"### Protocol Distribution",
|
|||
|
|
"",
|
|||
|
|
"| Protocol | Flows | Percentage |",
|
|||
|
|
"|----------|-------|------------|"
|
|||
|
|
]
|
|||
|
|
|
|||
|
|
for protocol, count in sorted(protocol_counts.items(), key=lambda x: x[1], reverse=True):
|
|||
|
|
percentage = count / len(flows) * 100
|
|||
|
|
lines.append(f"| {protocol} | {count} | {percentage:.1f}% |")
|
|||
|
|
|
|||
|
|
if enhanced_types:
|
|||
|
|
lines.extend([
|
|||
|
|
"",
|
|||
|
|
"### Enhanced Protocol Analysis",
|
|||
|
|
"",
|
|||
|
|
"| Enhanced Type | Flows | Percentage |",
|
|||
|
|
"|---------------|-------|------------|"
|
|||
|
|
])
|
|||
|
|
|
|||
|
|
for enhanced_type, count in sorted(enhanced_types.items(), key=lambda x: x[1], reverse=True):
|
|||
|
|
percentage = count / len(flows) * 100
|
|||
|
|
lines.append(f"| {enhanced_type} | {count} | {percentage:.1f}% |")
|
|||
|
|
|
|||
|
|
lines.extend([
|
|||
|
|
"",
|
|||
|
|
"### Overall Metrics",
|
|||
|
|
"",
|
|||
|
|
f"- **Total Analysis Duration:** {max(f.last_seen for f in flows if f.last_seen > 0) - min(f.first_seen for f in flows if f.first_seen > 0):.2f}s",
|
|||
|
|
f"- **Average Packets per Flow:** {total_packets / len(flows):.1f}",
|
|||
|
|
f"- **Average Bytes per Flow:** {self._format_bytes(total_bytes // len(flows))}",
|
|||
|
|
f"- **Overall Outlier Rate:** {total_outliers / total_packets * 100:.2f}%",
|
|||
|
|
"",
|
|||
|
|
"---",
|
|||
|
|
"",
|
|||
|
|
"*Report generated by StreamLens Network Analysis Tool*"
|
|||
|
|
])
|
|||
|
|
|
|||
|
|
return lines
|
|||
|
|
|
|||
|
|
def _get_flow_status_emoji(self, flow: FlowStats) -> str:
|
|||
|
|
"""Get emoji for flow status"""
|
|||
|
|
if flow.enhanced_analysis.decoder_type != "Standard":
|
|||
|
|
return "🔬" # Enhanced
|
|||
|
|
elif len(flow.outlier_frames) > flow.frame_count * 0.1:
|
|||
|
|
return "⚠️" # Alert
|
|||
|
|
elif len(flow.outlier_frames) > 0:
|
|||
|
|
return "⚡" # Warning
|
|||
|
|
else:
|
|||
|
|
return "✅" # Normal
|
|||
|
|
|
|||
|
|
def _get_quality_score(self, flow: FlowStats) -> int:
|
|||
|
|
"""Calculate quality score for flow"""
|
|||
|
|
if flow.enhanced_analysis.decoder_type != "Standard":
|
|||
|
|
return int(flow.enhanced_analysis.avg_frame_quality)
|
|||
|
|
else:
|
|||
|
|
# Base quality on outlier percentage
|
|||
|
|
outlier_pct = len(flow.outlier_frames) / flow.frame_count * 100 if flow.frame_count > 0 else 0
|
|||
|
|
return max(0, int(100 - outlier_pct * 10))
|
|||
|
|
|
|||
|
|
def _format_bytes(self, bytes_count: int) -> str:
|
|||
|
|
"""Format byte count with units"""
|
|||
|
|
if bytes_count >= 1_000_000_000:
|
|||
|
|
return f"{bytes_count / 1_000_000_000:.2f} GB"
|
|||
|
|
elif bytes_count >= 1_000_000:
|
|||
|
|
return f"{bytes_count / 1_000_000:.2f} MB"
|
|||
|
|
elif bytes_count >= 1_000:
|
|||
|
|
return f"{bytes_count / 1_000:.2f} KB"
|
|||
|
|
else:
|
|||
|
|
return f"{bytes_count} B"
|
|||
|
|
|
|||
|
|
def _format_timestamp(self, timestamp: float) -> str:
|
|||
|
|
"""Format timestamp for display"""
|
|||
|
|
if timestamp == 0:
|
|||
|
|
return "N/A"
|
|||
|
|
dt = datetime.datetime.fromtimestamp(timestamp)
|
|||
|
|
return dt.strftime("%H:%M:%S.%f")[:-3]
|