Update notification service

2026-03-30 18:53:03 +02:00
parent cb9a43f496
commit 54eab9af49
7 changed files with 1106 additions and 35 deletions
@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+"""
+AI Context Enrichment Module
+
+Enriches notification context with additional information to help AI provide
+more accurate and helpful responses:
+
+1. Event frequency - how often this error has occurred
+2. System uptime - helps distinguish startup issues from runtime failures
+3. SMART disk data - for disk-related errors
+4. Known error matching - from proxmox_known_errors database
+
+Author: MacRimi
+"""
+
+import os
+import re
+import subprocess
+from datetime import datetime, timedelta
+from typing import Optional, Dict, Any
+import sqlite3
+from pathlib import Path
+
+# Import known errors database
+try:
+    from proxmox_known_errors import get_error_context, find_matching_error
+except ImportError:
+    def get_error_context(*args, **kwargs):
+        return None
+    def find_matching_error(*args, **kwargs):
+        return None
+
+DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db')
+
+
+def get_system_uptime() -> str:
+    """Summarise the host uptime as text, with a hint about boot recency.
+
+    The first field of /proc/uptime (seconds since boot) is broken down into
+    days and hours; minutes are only shown when the uptime is under one hour.
+
+    Returns:
+        A string such as "2 minutes (just booted - likely startup issue)",
+        "45 minutes (recently booted)" or "89 days, 4 hours (stable system)".
+        "unknown" is returned when the uptime cannot be read or parsed.
+    """
+    def _unit(count: int, label: str) -> str:
+        # Pluralise every count except exactly 1.
+        return f"{count} {label}{'s' if count != 1 else ''}"
+
+    try:
+        with open('/proc/uptime', 'r') as proc_file:
+            elapsed = float(proc_file.readline().split()[0])
+
+        whole_days, day_remainder = divmod(elapsed, 86400)
+        days = int(whole_days)
+        hours = int(day_remainder // 3600)
+        minutes = int((elapsed % 3600) // 60)
+
+        # Days and hours are listed when non-zero; minutes are the fallback
+        # for uptimes shorter than an hour.
+        pieces = [
+            _unit(count, label)
+            for count, label in ((days, 'day'), (hours, 'hour'))
+            if count > 0
+        ]
+        if not pieces:
+            pieces = [_unit(minutes, 'minute')]
+        summary = ", ".join(pieces)
+
+        # Append a hint that tells startup problems from runtime failures.
+        if elapsed < 600:
+            return f"{summary} (just booted - likely startup issue)"
+        if elapsed < 3600:
+            return f"{summary} (recently booted)"
+        if days >= 30:
+            return f"{summary} (stable system)"
+        return summary
+
+    except Exception:
+        return "unknown"
+
+
+def get_event_frequency(error_id: str = None, error_key: str = None,
+                        category: str = None, hours: int = 24) -> Optional[Dict[str, Any]]:
+    """Look up how often an error has occurred in the health-monitor database.
+
+    The first non-empty selector wins, in the order ``error_id``,
+    ``error_key``, ``category``. ``error_id`` is compared against both the
+    ``error_key`` and ``error_id`` columns; ``category`` only considers rows
+    whose ``resolved_at`` is NULL. The most recently seen matching row is used.
+
+    Args:
+        error_id: Specific error ID (or key) to look up
+        error_key: Alternative error key
+        category: Error category
+        hours: Time window (default 24h). Currently not applied to the query;
+            kept so existing callers keep working.
+
+    Returns:
+        Dict with 'occurrences' and 'category', plus 'first_seen_ago' and
+        'pattern' when the stored timestamps allow computing them. None when
+        the database is missing, no selector was given, nothing matched, or
+        the lookup failed.
+    """
+    if not DB_PATH.exists():
+        return None
+
+    select = 'SELECT first_seen, last_seen, occurrences, category FROM errors WHERE '
+    newest_first = ' ORDER BY last_seen DESC LIMIT 1'
+    if error_id:
+        query = select + 'error_key = ? OR error_id = ?' + newest_first
+        params = (error_id, error_id)
+    elif error_key:
+        query = select + 'error_key = ?' + newest_first
+        params = (error_key,)
+    elif category:
+        query = select + 'category = ? AND resolved_at IS NULL' + newest_first
+        params = (category,)
+    else:
+        return None
+
+    try:
+        conn = sqlite3.connect(str(DB_PATH), timeout=5)
+        try:
+            row = conn.execute(query, params).fetchone()
+        finally:
+            # Close the handle even when the query raises (e.g. a missing
+            # table or column); otherwise the connection would leak.
+            conn.close()
+
+        if not row:
+            return None
+
+        first_seen, last_seen, occurrences, cat = row
+
+        try:
+            first_dt = datetime.fromisoformat(first_seen) if first_seen else None
+            last_dt = datetime.fromisoformat(last_seen) if last_seen else None
+            now = datetime.now()
+
+            result = {
+                'occurrences': occurrences or 1,
+                'category': cat
+            }
+
+            if first_dt:
+                age_seconds = (now - first_dt).total_seconds()
+                if age_seconds < 3600:
+                    result['first_seen_ago'] = f"{int(age_seconds / 60)} minutes ago"
+                elif age_seconds < 86400:
+                    result['first_seen_ago'] = f"{int(age_seconds / 3600)} hours ago"
+                else:
+                    result['first_seen_ago'] = f"{(now - first_dt).days} days ago"
+
+            if last_dt and first_dt and occurrences and occurrences > 1:
+                # Average gap between consecutive occurrences.
+                span = (last_dt - first_dt).total_seconds()
+                if span > 0:
+                    avg_interval = span / (occurrences - 1)
+                    if avg_interval < 60:
+                        result['pattern'] = f"recurring every ~{int(avg_interval)} seconds"
+                    elif avg_interval < 3600:
+                        result['pattern'] = f"recurring every ~{int(avg_interval / 60)} minutes"
+                    else:
+                        result['pattern'] = f"recurring every ~{int(avg_interval / 3600)} hours"
+
+            return result
+
+        except (ValueError, TypeError):
+            # Unparseable timestamps or a non-numeric counter: fall back to
+            # the bare counters without age/pattern details.
+            return {'occurrences': occurrences or 1, 'category': cat}
+
+    except Exception as e:
+        print(f"[AIContext] Error getting frequency: {e}")
+        return None
+
+
+def get_smart_data(disk_device: str) -> Optional[str]:
+    """Get SMART health data for a disk.
+
+    Runs ``smartctl -H`` for the overall verdict and ``smartctl -A`` for the
+    attribute table, then reports the critical attributes whose raw value is
+    a positive integer.
+
+    Args:
+        disk_device: Device path like /dev/sda or just sda
+
+    Returns:
+        A summary whose first line is "SMART Health: <PASSED|FAILED|UNKNOWN>",
+        followed by one indented line per non-zero critical attribute.
+        None when no device was given, the device node does not exist, or
+        smartctl is missing, times out or otherwise fails.
+    """
+    if not disk_device:
+        return None
+
+    # Normalize device path
+    if not disk_device.startswith('/dev/'):
+        disk_device = f'/dev/{disk_device}'
+
+    # Check device exists
+    if not os.path.exists(disk_device):
+        return None
+
+    critical_attrs = (
+        'Reallocated_Sector_Ct', 'Current_Pending_Sector',
+        'Offline_Uncorrectable', 'UDMA_CRC_Error_Count',
+        'Reallocated_Event_Count', 'Reported_Uncorrect',
+    )
+
+    try:
+        health = subprocess.run(
+            ['smartctl', '-H', disk_device],
+            capture_output=True, text=True, timeout=10
+        )
+
+        health_status = "UNKNOWN"
+        if "PASSED" in health.stdout:
+            health_status = "PASSED"
+        elif "FAILED" in health.stdout:
+            health_status = "FAILED"
+
+        attr_report = subprocess.run(
+            ['smartctl', '-A', disk_device],
+            capture_output=True, text=True, timeout=10
+        )
+
+        lines = [f"SMART Health: {health_status}"]
+
+        for line in attr_report.stdout.splitlines():
+            parts = line.split()
+            # Row layout: ID ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE
+            # UPDATED WHEN_FAILED RAW_VALUE. The name is matched on its own
+            # column and RAW_VALUE is read from column 10 rather than from the
+            # last token, because a raw value may be followed by an
+            # annotation such as "(Average 3)".
+            if len(parts) < 10 or parts[1] not in critical_attrs:
+                continue
+            raw_value = parts[9]
+            if raw_value.isdigit() and int(raw_value) > 0:
+                lines.append(f"  {parts[1]}: {raw_value}")
+
+        return "\n".join(lines)
+
+    except Exception:
+        # Best effort: smartctl not installed, timeout, permission problems
+        # or undecodable output all mean "no SMART data available".
+        return None
+
+
+def extract_disk_device(text: str) -> Optional[str]:
+    """Extract disk device name from error text.
+
+    The patterns are tried in order and the first pattern that matches
+    anywhere in the text wins, so explicit /dev/ paths take precedence over
+    bare names such as "sda".
+
+    Args:
+        text: Error message or log content
+
+    Returns:
+        Lower-case device name like 'sda', 'sda1' or 'nvme0n1p2' (without the
+        /dev/ prefix), or None when nothing that looks like a disk is found
+    """
+    if not text:
+        return None
+
+    # Common patterns for disk devices in errors
+    patterns = [
+        r'/dev/(sd[a-z]\d*)',
+        r'/dev/(nvme\d+n\d+(?:p\d+)?)',
+        r'/dev/(hd[a-z]\d*)',
+        r'/dev/(vd[a-z]\d*)',
+        r'\b(sd[a-z])\b',
+        r'disk[_\s]+(sd[a-z])',
+        r'ata\d+\.\d+: (sd[a-z])',
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, text, re.IGNORECASE)
+        if match:
+            # Matching ignores case, but device nodes are lower-case; return
+            # the normalized name so a later /dev/<name> lookup can succeed.
+            return match.group(1).lower()
+
+    return None
+
+
+def enrich_context_for_ai(
+    title: str,
+    body: str,
+    event_type: str,
+    data: Dict[str, Any],
+    journal_context: str = '',
+    detail_level: str = 'standard'
+) -> str:
+    """Build enriched context string for AI processing.
+
+    Sections are emitted in this order, each only when it has content:
+    system uptime, event frequency, SMART data (disk-related events only),
+    known error context, and finally the original journal logs.
+
+    Args:
+        title: Notification title
+        body: Notification body
+        event_type: Type of event (None is treated as empty)
+        data: Event data dict; 'error_key' / 'error_id' and 'category' are
+            read from it (None is treated as an empty dict)
+        journal_context: Original journal log context
+        detail_level: Level of detail, passed through to the known-error lookup
+
+    Returns:
+        The sections joined by blank lines, or the journal context (possibly
+        an empty string) when no section could be produced
+    """
+    data = data or {}
+    event_type = event_type or ''
+    context_parts = []
+    combined_text = f"{title} {body} {journal_context}"
+
+    # 1. System uptime (always useful)
+    uptime = get_system_uptime()
+    if uptime and uptime != "unknown":
+        context_parts.append(f"System uptime: {uptime}")
+
+    # 2. Event frequency
+    error_key = data.get('error_key') or data.get('error_id')
+    category = data.get('category')
+
+    freq = get_event_frequency(error_id=error_key, category=category)
+    if freq:
+        freq_line = f"Event frequency: {freq.get('occurrences', 1)} occurrence(s)"
+        if freq.get('first_seen_ago'):
+            freq_line += f", first seen {freq['first_seen_ago']}"
+        if freq.get('pattern'):
+            freq_line += f", {freq['pattern']}"
+        context_parts.append(freq_line)
+
+    # 3. SMART data for disk-related events
+    lowered_type = event_type.lower()
+    disk_related = any(x in lowered_type for x in ('disk', 'smart', 'storage', 'io_error'))
+    if not disk_related:
+        lowered_text = combined_text.lower()
+        # ATA ports are matched as "ata<N>" on a word boundary: a plain 'ata'
+        # substring would also fire on words like "data" or "metadata".
+        disk_related = (
+            any(x in lowered_text for x in ('disk', 'smart', '/dev/sd', 'i/o error'))
+            or re.search(r'\bata\d', lowered_text) is not None
+        )
+
+    if disk_related:
+        disk_device = extract_disk_device(combined_text)
+        if disk_device:
+            smart_data = get_smart_data(disk_device)
+            if smart_data:
+                context_parts.append(smart_data)
+
+    # 4. Known error matching
+    known_error_ctx = get_error_context(combined_text, category=category, detail_level=detail_level)
+    if known_error_ctx:
+        context_parts.append(known_error_ctx)
+
+    # 5. Add original journal context
+    if journal_context:
+        context_parts.append(f"Journal logs:\n{journal_context}")
+
+    # Combine all parts
+    if context_parts:
+        return "\n\n".join(context_parts)
+
+    return journal_context or ""
+
+
+def get_enriched_context(
+    event: 'NotificationEvent',
+    detail_level: str = 'standard'
+) -> str:
+    """Enrich the AI context for a NotificationEvent.
+
+    Thin adapter around enrich_context_for_ai: the title, body and journal
+    context are taken from ``event.data`` and the event type from
+    ``event.event_type``.
+
+    Args:
+        event: NotificationEvent object
+        detail_level: Level of detail
+
+    Returns:
+        Enriched context string
+    """
+    payload = event.data
+
+    # 'body' is preferred; 'message' is the fallback key and '' the last resort.
+    body_text = payload.get('body', payload.get('message', ''))
+    title_text = payload.get('title', '')
+    journal_text = payload.get('_journal_context', '')
+
+    return enrich_context_for_ai(
+        title=title_text,
+        body=body_text,
+        event_type=event.event_type,
+        data=payload,
+        journal_context=journal_text,
+        detail_level=detail_level
+    )
@@ -95,6 +95,8 @@ cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo
 cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  notification_channels.py not found"
 cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  notification_templates.py not found"
 cp "$SCRIPT_DIR/notification_events.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  notification_events.py not found"
+cp "$SCRIPT_DIR/proxmox_known_errors.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  proxmox_known_errors.py not found"
+cp "$SCRIPT_DIR/ai_context_enrichment.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  ai_context_enrichment.py not found"
 cp "$SCRIPT_DIR/startup_grace.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  startup_grace.py not found"
 cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  flask_notification_routes.py not found"
 cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️  oci_manager.py not found"
@@ -862,6 +862,307 @@ class HealthPersistence:
        
        conn.commit()
        conn.close()
+        
+        # Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
+        self._cleanup_stale_resources()
+    
+    def _cleanup_stale_resources(self):
+        """Auto-resolve active errors whose resource is gone or that went stale.
+
+        Every unresolved row of the ``errors`` table is checked against the
+        first matching rule:
+        - VMs/CTs ('vms' category or vm_/ct_ keys): the referenced VMID no
+          longer exists
+        - Disks/storage: the ZFS pool, block device or mount point named in
+          the error key no longer exists
+        - Network: the interface named in the error key no longer exists
+        - Services/pve_services: the referenced CT was deleted, or (for
+          pve_services) the service unit is unknown to systemd
+        - Logs: first seen more than 48h ago
+        - Cluster/corosync/qdevice/quorum keys: the node is not in a cluster
+        - Temperature: not seen for more than 24h
+        - Updates/security: first seen more than 7 days ago
+        - Fallback for anything left: first seen and last seen more than
+          7 days ago
+
+        Matching rows get ``resolved_at`` set to now and ``resolution_type``
+        set to 'auto'. When a resource listing cannot be obtained (tool
+        missing, timeout, psutil unavailable) the corresponding rule resolves
+        nothing.
+        """
+        import subprocess
+        import re
+
+        now = datetime.now()
+        now_iso = now.isoformat()
+
+        # Expensive lookups are done at most once per cleanup run.
+        _vm_ct_exists_cache = {}
+        _cache = {}
+
+        # Only tokens shaped like block-device names count as disks. A generic
+        # "[a-z]{2,4}" capture would also pick up "fs" from "disk_fs_/mnt/data"
+        # and wrongly report a removed device.
+        disk_key_re = re.compile(
+            r'(?:disk_|smart_|io_error_)'
+            r'((?:sd|hd|vd|xvd)[a-z]+\d*|nvme\d+(?:n\d+)?(?:p\d+)?|mmcblk\d+(?:p\d+)?|md\d+)'
+            r'(?![a-zA-Z0-9])'
+        )
+
+        def check_vm_ct_cached(vmid):
+            if vmid not in _vm_ct_exists_cache:
+                _vm_ct_exists_cache[vmid] = self._check_vm_ct_exists(vmid)
+            return _vm_ct_exists_cache[vmid]
+
+        def is_cluster_member():
+            if 'cluster' not in _cache:
+                try:
+                    result = subprocess.run(
+                        ['pvecm', 'status'],
+                        capture_output=True, text=True, timeout=5
+                    )
+                    _cache['cluster'] = (
+                        result.returncode == 0 and 'Cluster information' in result.stdout
+                    )
+                except Exception:
+                    _cache['cluster'] = True  # Assume cluster on error
+            return _cache['cluster']
+
+        def get_network_interfaces():
+            if 'interfaces' not in _cache:
+                try:
+                    import psutil
+                    _cache['interfaces'] = set(psutil.net_if_stats().keys())
+                except Exception:
+                    _cache['interfaces'] = set()
+            return _cache['interfaces']
+
+        def get_zfs_pools():
+            """Return the set of pool names, or None if they could not be listed."""
+            if 'zfs_pools' not in _cache:
+                pools = None
+                try:
+                    result = subprocess.run(
+                        ['zpool', 'list', '-H', '-o', 'name'],
+                        capture_output=True, text=True, timeout=5
+                    )
+                    if result.returncode == 0:
+                        pools = {
+                            name.strip()
+                            for name in result.stdout.splitlines()
+                            if name.strip()
+                        }
+                except Exception:
+                    pools = None
+                _cache['zfs_pools'] = pools
+            return _cache['zfs_pools']
+
+        def get_mount_points():
+            if 'mounts' not in _cache:
+                try:
+                    import psutil
+                    _cache['mounts'] = set(
+                        p.mountpoint for p in psutil.disk_partitions(all=True)
+                    )
+                except Exception:
+                    _cache['mounts'] = set()
+            return _cache['mounts']
+
+        def get_known_services():
+            """Return the names (without '.service') of all units systemd lists."""
+            if 'services' not in _cache:
+                names = set()
+                try:
+                    # --plain drops the leading status bullet that systemctl
+                    # prints for failed units; without it those units would be
+                    # recorded under the bullet instead of their name.
+                    result = subprocess.run(
+                        ['systemctl', 'list-units', '--type=service', '--all',
+                         '--no-legend', '--plain'],
+                        capture_output=True, text=True, timeout=10
+                    )
+                    if result.returncode == 0:
+                        for line in result.stdout.splitlines():
+                            # Tolerate a stray marker column before the unit.
+                            for token in line.split()[:2]:
+                                if token.endswith('.service'):
+                                    names.add(token[:-len('.service')])
+                                    break
+                except Exception:
+                    pass
+                _cache['services'] = names
+            return _cache['services']
+
+        def refers_to_known(tail, names):
+            """True if *tail* is one of *names*, or a name followed by '_<suffix>'.
+
+            Error keys append a state suffix to the resource name (e.g.
+            "rpool_degraded"), so the resource is matched as a prefix.
+            """
+            return any(tail == name or tail.startswith(name + '_') for name in names)
+
+        def extract_vmid_from_text(text):
+            """Extract VM/CT ID from error message or key."""
+            if not text:
+                return None
+            # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc.
+            match = re.search(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', text, re.IGNORECASE)
+            return match.group(1) if match else None
+
+        def get_age_hours(timestamp_str):
+            """Get age in hours from ISO timestamp string (0 if missing/invalid)."""
+            if not timestamp_str:
+                return 0
+            try:
+                dt = datetime.fromisoformat(timestamp_str)
+                return (now - dt).total_seconds() / 3600
+            except (ValueError, TypeError):
+                return 0
+
+        conn = self._get_conn()
+        try:
+            cursor = conn.cursor()
+
+            # Get all active (unresolved) errors with first_seen and last_seen for age checks
+            cursor.execute('''
+                SELECT id, error_key, category, message, first_seen, last_seen, severity FROM errors 
+                WHERE resolved_at IS NULL
+            ''')
+            active_errors = cursor.fetchall()
+
+            resolved_count = 0
+
+            for error_row in active_errors:
+                err_id, error_key, category, message, first_seen, last_seen, severity = error_row
+                should_resolve = False
+                age_hours = get_age_hours(first_seen)
+                last_seen_hours = get_age_hours(last_seen)
+
+                # === VM/CT ERRORS ===
+                if category == 'vms' or (error_key and error_key.startswith(('vm_', 'ct_'))):
+                    vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(message)
+                    if vmid and not check_vm_ct_cached(vmid):
+                        should_resolve = True
+
+                # === DISK ERRORS ===
+                elif category == 'disks' or category == 'storage':
+                    if error_key:
+                        # ZFS pool errors (e.g., "zfs_pool_rpool_degraded"). The
+                        # capture includes the state suffix, hence the prefix match.
+                        zfs_match = re.search(r'zfs_(?:pool_)?([a-zA-Z0-9_-]+)', error_key)
+                        if zfs_match:
+                            pools = get_zfs_pools()
+                            if pools is not None and not refers_to_known(zfs_match.group(1), pools):
+                                should_resolve = True
+
+                        # Disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing")
+                        if not should_resolve:
+                            disk_match = disk_key_re.search(error_key)
+                            if disk_match and not os.path.exists(f'/dev/{disk_match.group(1)}'):
+                                should_resolve = True
+
+                        # Mount point errors (e.g., "disk_fs_/mnt/data")
+                        if not should_resolve and 'disk_fs_' in error_key:
+                            mount_tail = error_key.split('disk_fs_', 1)[1]
+                            if mount_tail.startswith('/'):
+                                mounts = get_mount_points()
+                                # Prefix match keeps paths containing '_' intact.
+                                if mounts and not refers_to_known(mount_tail, mounts):
+                                    should_resolve = True
+
+                # === NETWORK ERRORS ===
+                elif category == 'network':
+                    if error_key:
+                        # "net_vmbr1_down" -> "vmbr1", "bond0_slave_error" -> "bond0".
+                        # A digit is required after the prefix so that words such
+                        # as "ethernet" are not taken for an interface name.
+                        iface_match = re.search(
+                            r'((?:vmbr|bond|eth|eno|ens|enp)\d[a-zA-Z0-9]*)', error_key
+                        )
+                        if iface_match:
+                            interfaces = get_network_interfaces()
+                            if interfaces and iface_match.group(1) not in interfaces:
+                                should_resolve = True
+
+                # === SERVICE ERRORS ===
+                elif category in ('services', 'pve_services'):
+                    # First check if it references a CT that no longer exists
+                    vmid = extract_vmid_from_text(message) or extract_vmid_from_text(error_key)
+                    if vmid and not check_vm_ct_cached(vmid):
+                        should_resolve = True
+
+                    # For pve_services, check if the service unit exists
+                    if not should_resolve and category == 'pve_services' and error_key:
+                        service_match = re.search(r'service_([a-zA-Z0-9_-]+)', error_key)
+                        if service_match:
+                            services = get_known_services()
+                            if services and service_match.group(1) not in services:
+                                should_resolve = True
+
+                # === LOG ERRORS ===
+                # Log errors describe point-in-time issues: age them out after 48h
+                elif category == 'logs' or (error_key and error_key.startswith(
+                        ('log_persistent_', 'log_spike_', 'log_cascade_', 'log_critical_'))):
+                    if age_hours > 48:
+                        should_resolve = True
+
+                # === CLUSTER ERRORS ===
+                elif error_key and any(
+                        x in error_key.lower() for x in ('cluster', 'corosync', 'qdevice', 'quorum')):
+                    if not is_cluster_member():
+                        should_resolve = True
+
+                # === TEMPERATURE ERRORS ===
+                elif category == 'temperature':
+                    if last_seen_hours > 24:
+                        should_resolve = True
+
+                # === UPDATES/SECURITY ERRORS ===
+                elif category in ('updates', 'security'):
+                    if age_hours > 168:  # 7 days
+                        should_resolve = True
+
+                # === FALLBACK: ANY STALE ERROR ===
+                # Older than 7 days and not seen for 7 days
+                if not should_resolve and age_hours > 168 and last_seen_hours > 168:
+                    should_resolve = True
+
+                if should_resolve:
+                    cursor.execute('''
+                        UPDATE errors SET resolved_at = ?, resolution_type = 'auto'
+                        WHERE id = ?
+                    ''', (now_iso, err_id))
+                    resolved_count += 1
+
+            if resolved_count > 0:
+                conn.commit()
+                print(f"[HealthPersistence] Auto-resolved {resolved_count} errors for stale/deleted resources")
+        finally:
+            # Release the connection even if a query or check raises.
+            conn.close()
+    
+    def _check_vm_ct_exists(self, vmid: str) -> bool:
+        """Report whether a VM or container with this ID is defined at all.
+
+        'qm config <vmid>' is tried first, then 'pct config <vmid>'; an exit
+        status of 0 from either one counts as "exists". These commands are
+        expected to succeed for stopped guests too and to fail only when the
+        guest is not defined.
+
+        Returns:
+            True if either command succeeds, or if running them raises
+            (missing tool, timeout): an unknown state is treated as "exists"
+            so that errors are not resolved by mistake. False when both
+            commands ran and reported failure.
+        """
+        import subprocess
+
+        try:
+            # VM tool first, then the container tool.
+            for tool in ('qm', 'pct'):
+                probe = subprocess.run(
+                    [tool, 'config', vmid],
+                    capture_output=True,
+                    text=True,
+                    timeout=3
+                )
+                if probe.returncode == 0:
+                    return True
+        except Exception:
+            # On error, assume it exists to avoid false positives
+            return True
+
+        return False
    
    def check_vm_running(self, vm_id: str) -> bool:
        """
@@ -28,7 +28,7 @@ from pathlib import Path

 # ─── Shared State for Cross-Watcher Coordination ──────────────────

-# ─── Startup Grace Period ─────────────────���──────────────────────────────────
+# ─── Startup Grace Period ────────────────────────────────────────────────────
 # Import centralized startup grace management
 # This provides a single source of truth for all grace period logic
 import startup_grace
@@ -2610,7 +2610,7 @@ class PollingCollector:
            pass


-# ─── Proxmox Webhook Receiver ───────────────────────────────────
+# ─── Proxmox Webhook Receiver ───────────────────────────────────

 class ProxmoxHookWatcher:
    """Receives native Proxmox VE notifications via local webhook endpoint.
@@ -44,6 +44,13 @@ from notification_events import (
    ProxmoxHookWatcher,
 )

+# AI context enrichment (uptime, frequency, SMART data, known errors)
+try:
+    from ai_context_enrichment import enrich_context_for_ai
+except ImportError:
+    def enrich_context_for_ai(title, body, event_type, data, journal_context='', detail_level='standard'):
+        return journal_context
+

 # ─── Constants ────────────────────────────────────────────────────

@@ -743,10 +750,10 @@ class NotificationManager:
            'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
        }
        
-        # Get journal context if available
-        journal_context = data.get('_journal_context', '')
-        
-        for ch_name, channel in channels.items():
+    # Get journal context if available (will be enriched per-channel based on detail_level)
+    raw_journal_context = data.get('_journal_context', '')
+    
+    for ch_name, channel in channels.items():
            # ── Per-channel category check ──
            # Default: category enabled (true) unless explicitly disabled.
            ch_group_key = f'{ch_name}.events.{event_group}'
@@ -771,17 +778,28 @@ class NotificationManager:
                rich_key = f'{ch_name}.rich_format'
                use_rich_format = self._config.get(rich_key, 'false') == 'true'
                
-                # ── Per-channel AI enhancement ──
-                # Apply AI with channel-specific detail level and emoji setting
-                # If AI is enabled AND rich_format is on, AI will include emojis directly
-                # Pass channel_type so AI knows whether to append original (email only)
-                channel_ai_config = {**ai_config, 'channel_type': ch_name}
-                ai_result = format_with_ai_full(
-                    ch_title, ch_body, severity, channel_ai_config,
-                    detail_level=detail_level,
-                    journal_context=journal_context,
-                    use_emojis=use_rich_format
-                )
+        # ── Per-channel AI enhancement ──
+        # Apply AI with channel-specific detail level and emoji setting
+        # If AI is enabled AND rich_format is on, AI will include emojis directly
+        # Pass channel_type so AI knows whether to append original (email only)
+        channel_ai_config = {**ai_config, 'channel_type': ch_name}
+        
+        # Enrich context with uptime, frequency, SMART data, and known errors
+        enriched_context = enrich_context_for_ai(
+            title=ch_title,
+            body=ch_body,
+            event_type=event_type,
+            data=data,
+            journal_context=raw_journal_context,
+            detail_level=detail_level
+        )
+        
+        ai_result = format_with_ai_full(
+        ch_title, ch_body, severity, channel_ai_config,
+        detail_level=detail_level,
+        journal_context=enriched_context,
+        use_emojis=use_rich_format
+        )
                ch_title = ai_result.get('title', ch_title)
                ch_body = ai_result.get('body', ch_body)
                
@@ -1384,7 +1384,13 @@ AI_DETAIL_TOKENS = {

 # System prompt template - optimized hybrid version
 AI_SYSTEM_PROMPT = """You are a notification FORMATTER for ProxMenux Monitor (Proxmox VE).
-Your job: translate and reformat alerts into {language}. You are NOT an analyst — do not interpret or diagnose.
+Your job: translate alerts into {language} and enrich them with context when provided.
+
+═══ ABSOLUTE CONSTRAINTS (NO EXCEPTIONS) ═══
+- NO HALLUCINATIONS: Do not invent causes, solutions, or facts not present in the provided data
+- NO SPECULATION: If something is unclear, state what IS known, not what MIGHT be
+- NO CONVERSATIONAL TEXT: Never write "Here is...", "I've translated...", "Let me explain..."
+- ONLY use information from: the message, journal context, and known error database (if provided)

 ═══ WHAT TO TRANSLATE ═══
 Translate: labels, descriptions, status words, units (GB→Go in French, etc.)
@@ -1394,15 +1400,37 @@ DO NOT translate: hostnames, IPs, paths, VM/CT IDs, device names (/dev/sdX), tec
 1. Plain text only — NO markdown, no **bold**, no `code`, no bullet lists (use "• " for packages only)
 2. Preserve severity: "failed" stays "failed", "warning" stays "warning" — never soften errors
 3. Preserve structure: keep same fields and line order, only translate content
-4. Detail level "{detail_level}": brief (2-3 lines) | standard (short paragraph) | detailed (full report)
+4. Detail level "{detail_level}":
+   - brief: 1-2 lines, essential facts only
+   - standard: short paragraph, key details and context
+   - detailed: full report with all available information, step-by-step if applicable
 5. DEDUPLICATION: merge duplicate facts from multiple sources into one clear statement
 6. EMPTY LISTS: write translated "none" after label, never leave blank
 7. Keep "hostname:" prefix in title — translate only the descriptive part
-8. DO NOT add recommendations or suggestions ("you should...", "try...", "consider...")
-{suggestions_addon}9. Present facts from message AND journal context — describe what happened, do NOT speculate
-10. OUTPUT ONLY the final result — no "Original:", no before/after comparisons
-11. Unknown input: preserve as closely as possible, translate what you can
+8. DO NOT add recommendations or suggestions UNLESS AI Suggestions mode is enabled below
+9. ENRICHED CONTEXT: You may receive additional context data including:
+   - "System uptime: X days (stable system)" → helps distinguish startup issues from runtime failures
+   - "Event frequency: N occurrences, first seen X ago" → indicates recurring vs one-time issues
+   - "SMART Health: PASSED/FAILED" with disk attributes → critical for disk errors
+   - "KNOWN PROXMOX ERROR DETECTED" with cause/solution → YOU MUST USE this exact information
+   
+   How to use enriched context:
+   - If uptime is <10min and error is service-related → mention "occurred shortly after boot"
+   - If frequency shows recurring pattern → mention "recurring issue (N times in X hours)"
+   - If SMART shows FAILED → treat as CRITICAL: "Disk failing - immediate attention required"
+   - If KNOWN ERROR is provided → YOU MUST incorporate its Cause and Solution (translate, don't copy verbatim)

+10. JOURNAL CONTEXT EXTRACTION: When journal logs are provided:
+   - Extract specific IDs (VM/CT numbers, disk devices, service names)
+   - Include relevant timestamps if they help explain the timeline
+   - Identify root cause when logs clearly show it (e.g., "exit-code 255" -> "process crashed")
+   - Translate technical terms: "Emask 0x10" -> "ATA bus error", "DRDY ERR" -> "drive not ready"
+   - If logs show the same error repeating, state frequency: "occurred 15 times in 10 minutes"
+   - IGNORE journal entries unrelated to the main event
+11. OUTPUT ONLY the final result — no "Original:", no before/after comparisons
+12. Unknown input: preserve as closely as possible, translate what you can
+13. REDUNDANCY: Never repeat the same information twice. If title says "CT 103 failed", body should not start with "Container 103 failed"
+{suggestions_addon}
 ═══ PROXMOX MAPPINGS (use directly, never explain) ═══
 pve-container@XXXX → "CT XXXX" | qemu-server@XXXX → "VM XXXX" | vzdump → "backup"
 pveproxy/pvedaemon/pvestatd → "Proxmox service" | corosync → "cluster service"
@@ -1457,18 +1485,17 @@ CORRECT (markers are separators only):

 # Addon for experimental suggestions mode
 AI_SUGGESTIONS_ADDON = """
-   EXCEPTION TO RULE 8 (Suggestions enabled): When journal context shows a clear, actionable problem,
-   you MAY add ONE brief suggestion at the END of the body (after all facts), using this format:
-   
-   💡 Tip: [your suggestion here]
-   
-   Guidelines for suggestions:
-   - Only suggest when the problem AND solution are clear from the logs
-   - Keep it to ONE line, max 100 characters
-   - Be specific: "Check disk /dev/sdb SMART status" not "Check your disks"
-   - Use commands when helpful: "Run 'systemctl restart pvedaemon'"
-   - Never speculate - only suggest based on evidence in the logs
-   - Skip the tip entirely if the problem is unclear or already resolved
+═══ AI SUGGESTIONS MODE (ENABLED) ═══
+You MAY add ONE brief, actionable tip at the END of the body using this exact format:
+
+💡 Tip: [your concise suggestion here]
+
+Rules for the tip:
+- ONLY include if the log context or Known Error database clearly points to a specific fix
+- Keep under 100 characters
+- Be specific: "Run 'pvecm status' to check quorum" NOT "Check cluster status"
+- If Known Error provides a solution, YOU MUST USE IT (don't invent your own)
+- Never guess — skip the tip if the cause/solution is unclear
 """

 # Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+"""
+Database of known Proxmox/Linux errors with causes, solutions, and severity levels.
+
+This provides the AI with accurate, pre-verified information about common errors,
+reducing hallucinations and ensuring consistent, helpful responses.
+
+Each entry includes:
+- pattern: regex pattern to match against error messages/logs
+- cause: brief explanation of what causes this error
+- cause_detailed: more comprehensive explanation for detailed mode
+- severity: info, warning, critical
+- solution: brief actionable solution
+- solution_detailed: step-by-step solution for detailed mode
+- url: optional documentation link
+- category: topic group used to filter matches (e.g. "cluster", "disks", "storage", "network")
+"""
+
+import re
+from typing import Optional, Dict, Any, List
+
+# Known error patterns with causes and solutions.
+#
+# Matching notes (see find_matching_error below in this module):
+# - Entries are tried in list order and the FIRST matching entry wins, so more
+#   specific patterns must stay above more generic ones
+#   (e.g. "bond.*slave.*link.*down" before "link.*down").
+# - Patterns are searched case-insensitively against lower-cased text.
+# - Short tokens that also occur inside ordinary words ("ata" in "data",
+#   "unc" in "function") are anchored with \b to avoid false positives.
+PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
+    # ==================== SUBSCRIPTION/LICENSE ====================
+    {
+        "pattern": r"no valid subscription|subscription.*invalid|not subscribed",
+        "cause": "Proxmox enterprise repository requires paid subscription",
+        "cause_detailed": "Proxmox VE uses a subscription model for enterprise features. Without a valid subscription key, access to the enterprise repository is denied. This is normal for home/lab users.",
+        "severity": "info",
+        "solution": "Use no-subscription repository or purchase subscription",
+        "solution_detailed": "For home/lab use: Switch to the no-subscription repository by editing /etc/apt/sources.list.d/pve-enterprise.list. For production: Purchase a subscription at proxmox.com/pricing",
+        "url": "https://pve.proxmox.com/wiki/Package_Repositories",
+        "category": "updates"
+    },
+
+    # ==================== CLUSTER/COROSYNC ====================
+    {
+        "pattern": r"quorum.*lost|lost.*quorum|not.*quorate",
+        "cause": "Cluster lost majority of voting nodes",
+        "cause_detailed": "Corosync cluster requires more than 50% of configured votes to maintain quorum. When quorum is lost, the cluster becomes read-only to prevent split-brain scenarios.",
+        "severity": "critical",
+        "solution": "Check network connectivity between nodes; ensure majority of nodes are online",
+        "solution_detailed": "1. Verify network connectivity: ping all cluster nodes\n2. Check corosync status: systemctl status corosync\n3. View cluster status: pvecm status\n4. If nodes are unreachable, check firewall rules (ports 5405-5412 UDP)\n5. For emergency single-node operation: pvecm expected 1",
+        "url": "https://pve.proxmox.com/wiki/Cluster_Manager",
+        "category": "cluster"
+    },
+    {
+        "pattern": r"corosync.*qdevice.*error|qdevice.*connection.*failed|qdevice.*not.*connected",
+        "cause": "QDevice helper node is unreachable",
+        "cause_detailed": "The Corosync QDevice provides an additional vote for 2-node clusters. When it cannot connect, the cluster may lose quorum if one node fails.",
+        "severity": "warning",
+        "solution": "Check QDevice server connectivity and corosync-qnetd service",
+        "solution_detailed": "1. Verify QDevice server is running: systemctl status corosync-qnetd (on QDevice host)\n2. Check connectivity: nc -zv <qdevice-ip> 5403\n3. Restart qdevice: systemctl restart corosync-qdevice\n4. Check certificates: corosync-qdevice-net-certutil -s",
+        "url": "https://pve.proxmox.com/wiki/Cluster_Manager#_corosync_external_vote_support",
+        "category": "cluster"
+    },
+    {
+        "pattern": r"corosync.*retransmit|corosync.*token.*timeout|ring.*mark.*faulty",
+        "cause": "Network latency or packet loss between cluster nodes",
+        "cause_detailed": "Corosync uses multicast/unicast for cluster communication. High latency, packet loss, or network congestion causes token timeouts and retransmissions, potentially leading to node eviction.",
+        "severity": "warning",
+        "solution": "Check network quality between nodes; consider increasing token timeout",
+        "solution_detailed": "1. Test network latency: ping -c 100 <other-node>\n2. Check for packet loss between nodes\n3. Verify MTU settings match on all interfaces\n4. Increase token timeout in /etc/pve/corosync.conf if needed (default 1000ms)\n5. Check switch/router for congestion",
+        "category": "cluster"
+    },
+
+    # ==================== DISK/STORAGE ====================
+    {
+        # "Pre-fail" and "Old_age" are attribute TYPE labels that smartctl prints
+        # for healthy disks too; only treat a row as failing when the same line
+        # also carries a FAILING marker (WHEN_FAILED column, e.g. FAILING_NOW).
+        "pattern": r"SMART.*FAILED|smart.*failed.*health|Pre-fail.*FAILING|Old_age.*FAILING",
+        "cause": "Disk SMART health check failed - disk is failing",
+        "cause_detailed": "SMART (Self-Monitoring, Analysis and Reporting Technology) detected critical disk health issues. The disk is likely failing and data loss is imminent.",
+        "severity": "critical",
+        "solution": "IMMEDIATELY backup data and replace disk",
+        "solution_detailed": "1. URGENT: Backup all data from this disk immediately\n2. Check SMART details: smartctl -a /dev/sdX\n3. Note the failing attributes (Reallocated_Sector_Ct, Current_Pending_Sector, etc.)\n4. Plan disk replacement\n5. If in RAID/ZFS: initiate disk replacement procedure",
+        "category": "disks"
+    },
+    {
+        "pattern": r"Reallocated_Sector_Ct.*threshold|reallocated.*sectors?.*exceeded",
+        "cause": "Disk has excessive bad sectors being remapped",
+        "cause_detailed": "The disk firmware has remapped multiple bad sectors to spare areas. While the disk is still functioning, this indicates physical degradation and eventual failure.",
+        "severity": "warning",
+        "solution": "Monitor closely and plan disk replacement",
+        "solution_detailed": "1. Check current value: smartctl -A /dev/sdX | grep Reallocated\n2. If value is increasing, plan immediate replacement\n3. Backup important data\n4. Run extended SMART test: smartctl -t long /dev/sdX",
+        "category": "disks"
+    },
+    {
+        # \b anchors keep "ata"/"unc" from matching inside unrelated words
+        # ("data", "metadata", "function"); "ata", "ata1", "ata1.00", "SATA"
+        # and "PATA" followed by "error" on the same line still match.
+        "pattern": r"\b[sp]?ata[\d.]*\b.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
+        "cause": "ATA communication error with disk",
+        "cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
+        "severity": "warning",
+        "solution": "Check SATA cables and connections; verify disk health with smartctl",
+        "solution_detailed": "1. Check SMART health: smartctl -H /dev/sdX\n2. Inspect and reseat SATA cables\n3. Try different SATA port\n4. Check dmesg for pattern of errors\n5. If errors persist, disk may be failing",
+        "category": "disks"
+    },
+    {
+        "pattern": r"I/O.*error|blk_update_request.*error|Buffer I/O error",
+        "cause": "Disk I/O operation failed",
+        "cause_detailed": "The kernel failed to read or write data to the disk. This can be caused by disk failure, cable issues, or filesystem corruption.",
+        "severity": "critical",
+        "solution": "Check disk health and connections immediately",
+        "solution_detailed": "1. Check SMART status: smartctl -H /dev/sdX\n2. Check dmesg for related errors: dmesg | grep -i error\n3. Verify disk is still accessible: lsblk\n4. If ZFS: check pool status with zpool status\n5. Consider filesystem check if safe to unmount",
+        "category": "disks"
+    },
+    {
+        "pattern": r"zfs.*pool.*DEGRADED|pool.*is.*degraded",
+        "cause": "ZFS pool has reduced redundancy",
+        "cause_detailed": "One or more devices in the ZFS pool are unavailable or experiencing errors. The pool is still functional but without full redundancy.",
+        "severity": "warning",
+        "solution": "Identify failed device with 'zpool status' and replace",
+        "solution_detailed": "1. Check pool status: zpool status <pool>\n2. Identify the DEGRADED or UNAVAIL device\n3. If device is present but erroring: zpool scrub <pool>\n4. To replace: zpool replace <pool> <old-device> <new-device>\n5. Monitor resilver progress: zpool status",
+        "category": "storage"
+    },
+    {
+        "pattern": r"zfs.*pool.*FAULTED|pool.*is.*faulted",
+        "cause": "ZFS pool is inaccessible",
+        "cause_detailed": "The ZFS pool has lost too many devices and cannot maintain data integrity. Data may be inaccessible.",
+        "severity": "critical",
+        "solution": "Check failed devices; may need data recovery",
+        "solution_detailed": "1. Check status: zpool status <pool>\n2. Identify all failed devices\n3. Attempt to online devices: zpool online <pool> <device>\n4. If drives are physically present, try zpool clear <pool>\n5. May require data recovery if multiple drives failed",
+        "category": "storage"
+    },
+
+    # ==================== CEPH ====================
+    {
+        "pattern": r"ceph.*OSD.*down|osd\.\d+.*down|ceph.*osd.*failed",
+        "cause": "Ceph OSD daemon is not running",
+        "cause_detailed": "A Ceph Object Storage Daemon (OSD) has stopped or crashed. This reduces storage redundancy and may trigger data rebalancing.",
+        "severity": "warning",
+        "solution": "Check disk health and restart OSD service",
+        "solution_detailed": "1. Check OSD status: ceph osd tree\n2. View OSD logs: journalctl -u ceph-osd@<id>\n3. Check underlying disk: smartctl -H /dev/sdX\n4. Restart OSD: systemctl start ceph-osd@<id>\n5. If OSD keeps crashing, check for disk failure",
+        "category": "storage"
+    },
+    {
+        "pattern": r"ceph.*health.*WARN|HEALTH_WARN",
+        "cause": "Ceph cluster has warnings",
+        "cause_detailed": "Ceph detected issues that don't prevent operation but should be addressed. Common causes: degraded PGs, clock skew, full OSDs.",
+        "severity": "warning",
+        "solution": "Run 'ceph health detail' for specific issues",
+        "solution_detailed": "1. Get details: ceph health detail\n2. Common fixes:\n   - Degraded PGs: wait for recovery or add capacity\n   - Clock skew: sync NTP on all nodes\n   - Full OSDs: add storage or delete data\n3. Check: ceph status",
+        "category": "storage"
+    },
+    {
+        "pattern": r"ceph.*health.*ERR|HEALTH_ERR",
+        "cause": "Ceph cluster has critical errors",
+        "cause_detailed": "Ceph has detected critical issues that may affect data availability or integrity. Immediate attention required.",
+        "severity": "critical",
+        "solution": "Run 'ceph health detail' and address errors immediately",
+        "solution_detailed": "1. Get details: ceph health detail\n2. Check OSD status: ceph osd tree\n3. Check MON status: ceph mon stat\n4. View PG status: ceph pg stat\n5. Address each error shown in health detail",
+        "category": "storage"
+    },
+
+    # ==================== VM/CT ERRORS ====================
+    {
+        "pattern": r"TASK ERROR.*failed to get exclusive lock|lock.*timeout|couldn't acquire lock",
+        "cause": "Resource is locked by another operation",
+        "cause_detailed": "Another task is currently holding a lock on this VM/CT. This prevents concurrent modifications that could cause corruption.",
+        "severity": "info",
+        "solution": "Wait for other task to complete or check for stuck tasks",
+        "solution_detailed": "1. Check running tasks: cat /var/log/pve/tasks/active\n2. Wait for task completion\n3. If task is stuck (>1h), check process: ps aux | grep <vmid>\n4. As last resort, remove lock file: rm /var/lock/qemu-server/lock-<vmid>.conf",
+        "category": "vms"
+    },
+    {
+        "pattern": r"kvm.*not.*available|kvm.*disabled|hardware.*virtualization.*disabled",
+        "cause": "KVM/hardware virtualization not available",
+        "cause_detailed": "The CPU's hardware virtualization extensions (Intel VT-x or AMD-V) are either not supported, not enabled in BIOS, or blocked by another hypervisor.",
+        "severity": "warning",
+        "solution": "Enable VT-x/AMD-V in BIOS settings",
+        "solution_detailed": "1. Reboot into BIOS/UEFI\n2. Find Virtualization settings (often in CPU or Advanced section)\n3. Enable Intel VT-x or AMD-V/SVM\n4. Save and reboot\n5. Verify: grep -E 'vmx|svm' /proc/cpuinfo",
+        "category": "vms"
+    },
+    {
+        "pattern": r"out of memory|OOM.*kill|cannot allocate memory|memory.*exhausted",
+        "cause": "System or VM ran out of memory",
+        "cause_detailed": "The Linux OOM (Out Of Memory) killer terminated a process to free memory. This indicates memory pressure from overcommitment or memory leaks.",
+        "severity": "critical",
+        "solution": "Increase memory allocation or reduce VM memory usage",
+        "solution_detailed": "1. Check what was killed: dmesg | grep -i oom\n2. Review memory usage: free -h\n3. Check balloon driver status for VMs\n4. Consider adding swap or RAM\n5. Review VM memory allocations for overcommitment",
+        "category": "memory"
+    },
+
+    # ==================== NETWORK ====================
+    {
+        "pattern": r"bond.*slave.*link.*down|bond.*no.*active.*slave",
+        "cause": "Network bond lost a slave interface",
+        "cause_detailed": "One or more physical interfaces in a network bond have lost link. Depending on bond mode, this may reduce bandwidth or affect failover.",
+        "severity": "warning",
+        "solution": "Check physical cable connections and switch ports",
+        "solution_detailed": "1. Check bond status: cat /proc/net/bonding/bond0\n2. Identify down slave interface\n3. Check physical cable connection\n4. Check switch port status and errors\n5. Verify interface: ethtool <slave-iface>",
+        "category": "network"
+    },
+    {
+        "pattern": r"link.*not.*ready|carrier.*lost|link.*down|NIC.*Link.*Down",
+        "cause": "Network interface lost link",
+        "cause_detailed": "The physical or virtual network interface has lost its connection. This could be a cable issue, switch problem, or driver issue.",
+        "severity": "warning",
+        "solution": "Check cable, switch port, and interface status",
+        "solution_detailed": "1. Check interface: ip link show <iface>\n2. Check cable connection\n3. Check switch port LEDs\n4. Try: ip link set <iface> down && ip link set <iface> up\n5. Check driver: ethtool -i <iface>",
+        "category": "network"
+    },
+    {
+        "pattern": r"bridge.*STP.*blocked|spanning.*tree.*blocked",
+        "cause": "Spanning Tree Protocol blocked a port",
+        "cause_detailed": "STP detected a potential network loop and blocked a bridge port to prevent broadcast storms. This is normal behavior but may indicate network topology issues.",
+        "severity": "info",
+        "solution": "Review network topology; this may be expected behavior",
+        "solution_detailed": "1. Check bridge status: brctl show\n2. View STP state: brctl showstp <bridge>\n3. If unexpected, review network topology for loops\n4. Consider disabling STP if network is simple: brctl stp <bridge> off",
+        "category": "network"
+    },
+
+    # ==================== SERVICES ====================
+    {
+        "pattern": r"pvedaemon.*failed|pveproxy.*failed|pvestatd.*failed",
+        "cause": "Critical Proxmox service failed",
+        "cause_detailed": "One of the core Proxmox daemons has crashed or failed to start. This may affect web GUI access or API functionality.",
+        "severity": "critical",
+        "solution": "Restart the failed service; check logs for cause",
+        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -u <service> -n 50\n3. Restart: systemctl restart <service>\n4. If persistent, check: /var/log/pveproxy/access.log",
+        "category": "pve_services"
+    },
+    {
+        "pattern": r"failed to start.*service|service.*start.*failed|service.*activation.*failed",
+        "cause": "System service failed to start",
+        "cause_detailed": "A systemd service unit failed during startup. This could be due to configuration errors, missing dependencies, or resource issues.",
+        "severity": "warning",
+        "solution": "Check service logs with journalctl -u <service>",
+        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -xeu <service>\n3. Check config: systemctl cat <service>\n4. Verify dependencies: systemctl list-dependencies <service>\n5. Try restart: systemctl restart <service>",
+        "category": "services"
+    },
+
+    # ==================== BACKUP ====================
+    {
+        "pattern": r"backup.*failed|vzdump.*error|backup.*job.*failed",
+        "cause": "Backup job failed",
+        "cause_detailed": "A scheduled or manual backup operation failed. Common causes: storage full, VM locked, network issues for remote storage.",
+        "severity": "warning",
+        "solution": "Check backup storage space and VM status",
+        "solution_detailed": "1. Check backup log in Datacenter > Backup\n2. Verify storage space: df -h\n3. Check if VM is locked: qm list or pct list\n4. Verify backup storage is accessible\n5. Try manual backup to identify specific error",
+        "category": "backups"
+    },
+
+    # ==================== CERTIFICATES ====================
+    {
+        "pattern": r"certificate.*expired|SSL.*certificate.*expired|cert.*expir",
+        "cause": "SSL/TLS certificate has expired",
+        "cause_detailed": "An SSL certificate used for secure communication has passed its expiration date. This may cause connection failures or security warnings.",
+        "severity": "warning",
+        "solution": "Renew the certificate using pvenode cert set or Let's Encrypt",
+        "solution_detailed": "1. Check certificate: pvenode cert info\n2. For self-signed renewal: pvecm updatecerts\n3. For Let's Encrypt: pvenode acme cert order\n4. Restart pveproxy after renewal: systemctl restart pveproxy",
+        "url": "https://pve.proxmox.com/wiki/Certificate_Management",
+        "category": "security"
+    },
+
+    # ==================== HARDWARE/TEMPERATURE ====================
+    {
+        "pattern": r"temperature.*critical|thermal.*critical|CPU.*overheating|temp.*above.*threshold",
+        "cause": "Component temperature critical",
+        "cause_detailed": "A hardware component (CPU, disk, etc.) has reached a dangerous temperature. Sustained high temperatures can cause hardware damage or system shutdowns.",
+        "severity": "critical",
+        "solution": "Check cooling system immediately; clean dust, verify fans",
+        "solution_detailed": "1. Check current temps: sensors\n2. Verify all fans are running\n3. Clean dust from heatsinks and filters\n4. Ensure adequate airflow\n5. Consider reapplying thermal paste if CPU\n6. Check ambient room temperature",
+        "category": "temperature"
+    },
+
+    # ==================== AUTHENTICATION ====================
+    {
+        "pattern": r"authentication.*failed|login.*failed|invalid.*credentials|access.*denied",
+        "cause": "Authentication failure",
+        "cause_detailed": "A login attempt failed due to invalid credentials or permissions. Multiple failures may indicate a brute-force attack.",
+        "severity": "info",
+        "solution": "Verify credentials; check for unauthorized access attempts",
+        "solution_detailed": "1. Review auth logs: journalctl -u pvedaemon | grep auth\n2. Check for multiple failures from same IP\n3. Verify user exists: pveum user list\n4. If attack suspected, consider fail2ban\n5. Reset password if needed: pveum passwd <user>",
+        "category": "security"
+    },
+]
+
+
+def find_matching_error(text: str, category: Optional[str] = None) -> Optional[Dict[str, Any]]:
+    """Return the first known-error entry whose pattern occurs in ``text``.
+
+    Entries of PROXMOX_KNOWN_ERRORS are tried in list order; the search is
+    case-insensitive and runs against a lower-cased copy of ``text``.
+
+    Args:
+        text: Error message or log content to match against
+        category: When given (non-empty), only entries whose "category"
+            equals this value are considered
+
+    Returns:
+        The matching entry dict, or None when ``text`` is empty or nothing matches
+    """
+    if not text:
+        return None
+
+    haystack = text.lower()
+
+    # Narrow the table to the requested category before running any regex
+    candidates = (
+        entry for entry in PROXMOX_KNOWN_ERRORS
+        if not category or entry.get("category") == category
+    )
+
+    for entry in candidates:
+        try:
+            hit = re.search(entry["pattern"], haystack, re.IGNORECASE)
+        except re.error:
+            # A malformed pattern must not break matching for the other entries
+            continue
+        if hit:
+            return entry
+
+    return None
+
+
+def get_error_context(text: str, category: Optional[str] = None, detail_level: str = "standard") -> Optional[str]:
+    """Get formatted context for a known error.
+
+    Looks up ``text`` with find_matching_error() and renders the matching
+    entry as a short text block for the AI prompt.
+
+    Args:
+        text: Error message to match
+        category: Optional category filter
+        detail_level: "minimal" (alias "brief"), "standard", or "detailed".
+            Any other value is rendered like "detailed".
+
+    Returns:
+        Formatted context string, or None when no known error matches
+    """
+    error = find_matching_error(text, category)
+    if not error:
+        return None
+
+    # The notification prompt names its shortest level "brief"; accept it as
+    # an alias so a brief notification does not receive the full
+    # step-by-step block meant for "detailed".
+    if detail_level in ("minimal", "brief"):
+        return f"Known issue: {error['cause']}"
+
+    if detail_level == "standard":
+        cause = error['cause']
+        solution = error['solution']
+        url_label = "Docs"
+    else:  # detailed: prefer the long texts, fall back to the short ones
+        cause = error.get('cause_detailed', error['cause'])
+        solution = error.get('solution_detailed', error['solution'])
+        url_label = "Documentation"
+
+    lines = [
+        "KNOWN PROXMOX ERROR DETECTED:",
+        f"  Cause: {cause}",
+        f"  Severity: {error['severity'].upper()}",
+        f"  Solution: {solution}",
+    ]
+    if error.get("url"):
+        lines.append(f"  {url_label}: {error['url']}")
+    return "\n".join(lines)
+
+
+def get_all_patterns() -> List[str]:
+    """Return the regex pattern string of every known-error entry, in table order."""
+    patterns: List[str] = []
+    for entry in PROXMOX_KNOWN_ERRORS:
+        patterns.append(entry["pattern"])
+    return patterns