# StreamLens/debug_outlier_discrepancy.py

#!/usr/bin/env python3
"""Debug outlier count discrepancy"""

import sys
sys.path.append('.')

from analyzer.analysis import EthernetAnalyzer
from analyzer.utils import PCAPLoader
from analyzer.analysis.background_analyzer import BackgroundAnalyzer
import time

def _find_flow_by_source(analyzer, src_ip):
    """Return the first flow in ``analyzer.flows`` whose ``src_ip`` matches, else None."""
    # Only the flow objects are inspected, so iterate values rather than items.
    return next(
        (flow for flow in analyzer.flows.values() if flow.src_ip == src_ip),
        None,
    )


def _print_flow_summary(flow, src_ip, sigma):
    """Print endpoints, packet/outlier counts and timing stats for one flow.

    Prints a short notice instead when ``flow`` is None, so a missing flow is
    visible in the output rather than silently skipped.
    """
    if flow is None:
        print(f"No flow found with source IP {src_ip}")
        return

    print(f"Flow: {flow.src_ip}:{flow.src_port} -> {flow.dst_ip}:{flow.dst_port}")
    print(f"Packets: {flow.frame_count}")
    print(f"Outliers: {len(flow.outlier_frames)}")
    # Cap the listing at the first 20 frame numbers to keep the output short.
    print(f"Outlier frames: {sorted(flow.outlier_frames)[:20]}")
    # Timing values are multiplied by 1000 and labelled "ms"
    # (presumably stored in seconds -- confirm against the analyzer).
    print(f"Avg ΔT: {flow.avg_inter_arrival * 1000:.3f} ms")
    print(f"Std σ: {flow.std_inter_arrival * 1000:.3f} ms")
    threshold = flow.avg_inter_arrival + sigma * flow.std_inter_arrival
    # ``:g`` renders 3.0 as "3", matching the historical "3σ threshold" label.
    print(f"{sigma:g}σ threshold: {threshold * 1000:.3f} ms")


def _print_frame_differences(direct_only, background_only):
    """Print the outlier frame numbers reported by only one of the two methods."""
    if direct_only:
        print(f"Only in direct: {sorted(direct_only)}")
    if background_only:
        print(f"Only in background: {sorted(background_only)}")


def debug_outliers(pcap_file: str, src_ip: str = "192.168.4.89", *, sigma: float = 3.0) -> None:
    """Compare outlier detection between direct and background processing.

    The capture is analysed twice with separate ``EthernetAnalyzer``
    instances: once by feeding every packet from ``PCAPLoader`` through
    ``_process_single_packet`` and calling ``calculate_statistics``, and once
    via ``BackgroundAnalyzer.start_parsing``. For each run the first flow whose
    source IP equals ``src_ip`` is summarised, then the two results are
    compared and any differences printed.

    Args:
        pcap_file: Capture file handed to ``PCAPLoader`` and
            ``BackgroundAnalyzer.start_parsing``.
        src_ip: Source IP address used to pick the flow to inspect.
        sigma: Outlier threshold multiplier passed to both analyzers and used
            for the printed threshold, so the two can never disagree.
    """
    print("=== METHOD 1: Direct Processing ===")
    direct_analyzer = EthernetAnalyzer(outlier_threshold_sigma=sigma)
    packets = PCAPLoader(pcap_file).load_all()

    # Frame numbers are 1-based, hence enumerate(..., 1).
    for frame_number, packet in enumerate(packets, 1):
        direct_analyzer._process_single_packet(packet, frame_number)

    direct_analyzer.calculate_statistics()

    direct_flow = _find_flow_by_source(direct_analyzer, src_ip)
    _print_flow_summary(direct_flow, src_ip, sigma)

    print("\n=== METHOD 2: Background Processing (TUI) ===")
    background_analyzer = EthernetAnalyzer(outlier_threshold_sigma=sigma)
    bg_analyzer = BackgroundAnalyzer(background_analyzer)

    bg_analyzer.start_parsing(pcap_file)

    # Poll until the background analyzer reports that parsing has finished.
    while bg_analyzer.is_parsing:
        time.sleep(0.1)

    background_flow = _find_flow_by_source(background_analyzer, src_ip)
    _print_flow_summary(background_flow, src_ip, sigma)

    print("\n=== COMPARISON ===")
    if direct_flow is None or background_flow is None:
        # Nothing to compare; say so instead of leaving the section empty.
        print("Cannot compare: flow missing from at least one method")
        return

    direct_count = len(direct_flow.outlier_frames)
    background_count = len(background_flow.outlier_frames)
    print(f"Direct outliers: {direct_count}")
    print(f"Background outliers: {background_count}")

    direct_frames = set(direct_flow.outlier_frames)
    background_frames = set(background_flow.outlier_frames)
    direct_only = direct_frames - background_frames
    background_only = background_frames - direct_frames

    if direct_count == background_count:
        print("✅ Outlier counts match!")
        # Equal counts do not guarantee the same frames were flagged.
        if direct_only or background_only:
            print("⚠️ Outlier frame numbers differ despite matching counts")
            _print_frame_differences(direct_only, background_only)
        return

    print("\n⚠️ OUTLIER COUNT MISMATCH!")
    _print_frame_differences(direct_only, background_only)

    # Higher-precision statistics to reveal small numeric drift.
    print("\nTiming comparison:")
    print(f"Direct - Avg: {direct_flow.avg_inter_arrival * 1000:.6f} ms, Std: {direct_flow.std_inter_arrival * 1000:.6f} ms")
    print(f"Background - Avg: {background_flow.avg_inter_arrival * 1000:.6f} ms, Std: {background_flow.std_inter_arrival * 1000:.6f} ms")

    direct_times = direct_flow.inter_arrival_times
    background_times = background_flow.inter_arrival_times

    print("\nInter-arrival times count:")
    print(f"Direct: {len(direct_times)}")
    print(f"Background: {len(background_times)}")

    # Compare only as many samples as both runs have, up to ten.
    print("\nFirst 10 inter-arrival times comparison:")
    sample_count = min(10, len(direct_times), len(background_times))
    for i in range(sample_count):
        t1 = direct_times[i] * 1000
        t2 = background_times[i] * 1000
        diff = abs(t1 - t2)
        print(f"  [{i}] Direct: {t1:.6f} ms, Background: {t2:.6f} ms, Diff: {diff:.6f} ms")

if __name__ == "__main__":
    # Usage: debug_outlier_discrepancy.py [PCAP_FILE [SRC_IP]]
    # With no arguments the bundled sample capture is analysed; SRC_IP is
    # optional and falls back to debug_outliers' own default when omitted.
    cli_args = sys.argv[1:]
    pcap_path = cli_args[0] if cli_args else "1 PTPGM.pcapng"
    if len(cli_args) > 1:
        debug_outliers(pcap_path, cli_args[1])
    else:
        debug_outliers(pcap_path)