first working version of the PyShark flow analyzer
187
pyshark_poc/analyzer.py
Normal file
@@ -0,0 +1,187 @@
import pyshark
from collections import defaultdict
from typing import Optional, List, Type

import pandas as pd
from tabulate import tabulate

from .models import FlowKey
from .stats import MultiStats, BaseStats, STATS_TYPES


class PySharkAnalyzer:
    """Packet flow analyzer using PyShark for Wireshark dissector support."""

    def __init__(self, stats_classes: Optional[List[Type[BaseStats]]] = None):
        if stats_classes is None:
            stats_classes = [STATS_TYPES['overview']]
        self.stats_classes = stats_classes
        self.flows = defaultdict(lambda: MultiStats(stats_classes))
        self.packet_count = 0

    def _get_flow_key(self, packet) -> Optional[FlowKey]:
        """Extract a flow key from a PyShark packet."""
        try:
            # Check for IP layer
            if not hasattr(packet, 'ip'):
                return None

            src_ip = packet.ip.src
            dst_ip = packet.ip.dst
            # transport_layer can be None for non-TCP/UDP traffic, so fall back to 'IP'
            protocol = getattr(packet, 'transport_layer', None) or 'IP'

            # Get ports based on protocol
            src_port = 0
            dst_port = 0

            if hasattr(packet, 'tcp'):
                src_port = int(packet.tcp.srcport)
                dst_port = int(packet.tcp.dstport)
                protocol = 'TCP'
            elif hasattr(packet, 'udp'):
                src_port = int(packet.udp.srcport)
                dst_port = int(packet.udp.dstport)
                protocol = 'UDP'

            # Check for extended protocol types
            extended_type = None
            if hasattr(packet, 'ptp'):
                extended_type = 'PTP'
            # Add more protocol detection here as needed

            return FlowKey(src_ip, src_port, dst_ip, dst_port, protocol, extended_type)

        except AttributeError:
            return None

    def _process_packet(self, packet):
        """Process a single packet."""
        key = self._get_flow_key(packet)
        if key:
            # Get timestamp and size
            timestamp = float(packet.sniff_timestamp) if hasattr(packet, 'sniff_timestamp') else 0.0
            size = int(packet.length) if hasattr(packet, 'length') else 0

            self.flows[key].add(timestamp, size, packet)
            self.packet_count += 1

    def analyze_pcap(self, file: str, display_filter: Optional[str] = None):
        """Analyze packets from a PCAP file."""
        print(f"Analyzing: {file}")
        if display_filter:
            print(f"Filter: {display_filter}")

        try:
            # Use FileCapture for PCAP files
            capture = pyshark.FileCapture(
                file,
                display_filter=display_filter,
                use_json=True,      # Use JSON output for better performance
                include_raw=False   # Don't include raw packet data
            )

            # Process packets
            for packet in capture:
                self._process_packet(packet)
                # Show progress every 1000 packets
                if self.packet_count and self.packet_count % 1000 == 0:
                    print(f" Processed {self.packet_count} packets...")

            capture.close()
            print(f"Found {len(self.flows)} flows from {self.packet_count} packets")

        except Exception as e:
            print(f"Error analyzing PCAP: {e}")

    def analyze_live(self, interface: str, count: int = 100,
                     display_filter: Optional[str] = None,
                     bpf_filter: Optional[str] = None):
        """Capture and analyze packets from a live interface."""
        print(f"Capturing {count} packets on {interface}")
        if display_filter:
            print(f"Display filter: {display_filter}")
        if bpf_filter:
            print(f"BPF filter: {bpf_filter}")

        try:
            # Use LiveCapture for live capture
            capture = pyshark.LiveCapture(
                interface=interface,
                display_filter=display_filter,
                bpf_filter=bpf_filter,
                use_json=True,
                include_raw=False
            )

            # Capture packets
            capture.sniff(packet_count=count)

            # Process captured packets
            for packet in capture:
                self._process_packet(packet)

            capture.close()
            print(f"Found {len(self.flows)} flows from {self.packet_count} packets")

        except Exception as e:
            print(f"Error during live capture: {e}")

    def summary(self) -> pd.DataFrame:
        """Generate a summary DataFrame of all flows."""
        rows = []
        for key, multi_stats in self.flows.items():
            row = {
                'Src IP': key.src_ip,
                'Src Port': key.src_port,
                'Dst IP': key.dst_ip,
                'Dst Port': key.dst_port,
                'Proto': key.protocol
            }
            if key.extended_type:
                row['Type'] = key.extended_type
            row.update(multi_stats.get_combined_summary())
            rows.append(row)

        # Sort by packet count descending
        df = pd.DataFrame(rows)
        if not df.empty and 'Pkts' in df.columns:
            df = df.sort_values('Pkts', ascending=False)
        return df

    def print_summary(self):
        """Print a formatted summary of flows."""
        df = self.summary()
        if df.empty:
            print("No flows detected")
            return

        print(f"\n{len(df)} flows:")
        print(tabulate(df, headers='keys', tablefmt='plain', showindex=False))

        if 'Pkts' in df.columns and 'Bytes' in df.columns:
            print(f"\nTotals: {df['Pkts'].sum()} packets, {df['Bytes'].sum()} bytes")

    def get_protocol_summary(self) -> pd.DataFrame:
        """Get a summary grouped by protocol."""
        df = self.summary()
        if df.empty:
            return df

        # Group by protocol
        protocol_summary = df.groupby('Proto').agg({
            'Pkts': 'sum',
            'Bytes': 'sum'
        }).reset_index()

        return protocol_summary

    def apply_wireshark_filter(self, display_filter: str):
        """
        Apply a Wireshark display filter to the analysis.
        This demonstrates PyShark's ability to use Wireshark's filtering.
        """
        filtered_flows = defaultdict(lambda: MultiStats(self.stats_classes))

        # This would require re-processing with the filter;
        # shown here as an example of the capability.
        print("Note: To apply Wireshark filters, re-analyze with the display_filter parameter")
        return filtered_flows
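A minimal usage sketch of the class added above (not part of the commit; the file name capture.pcap, the "dns" display filter, the eth0 interface, and the PTP port filter are placeholder assumptions, and live capture typically requires elevated privileges):

from pyshark_poc.analyzer import PySharkAnalyzer

# Offline analysis: parse a capture file, keeping only DNS traffic via a
# Wireshark display filter, then print per-flow and per-protocol summaries.
analyzer = PySharkAnalyzer()
analyzer.analyze_pcap("capture.pcap", display_filter="dns")
analyzer.print_summary()
print(analyzer.get_protocol_summary())

# Live capture of 200 packets (usually needs root/admin rights); the BPF filter
# here restricts capture to the PTP event/general UDP ports as an example.
# analyzer.analyze_live("eth0", count=200, bpf_filter="udp port 319 or udp port 320")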