Update notification service

This commit is contained in:
MacRimi
2026-03-30 18:53:03 +02:00
parent cb9a43f496
commit 54eab9af49
7 changed files with 1106 additions and 35 deletions
+375
View File
@@ -0,0 +1,375 @@
#!/usr/bin/env python3
"""
AI Context Enrichment Module
Enriches notification context with additional information to help AI provide
more accurate and helpful responses:
1. Event frequency - how often this error has occurred
2. System uptime - helps distinguish startup issues from runtime failures
3. SMART disk data - for disk-related errors
4. Known error matching - from proxmox_known_errors database
Author: MacRimi
"""
import os
import re
import subprocess
from datetime import datetime, timedelta
from typing import Optional, Dict, Any
import sqlite3
from pathlib import Path
# Import known errors database
# proxmox_known_errors is treated as optional: if the import fails, both
# lookup functions are replaced by stubs that accept any arguments and
# return None, so callers simply get no known-error context.
try:
from proxmox_known_errors import get_error_context, find_matching_error
except ImportError:
def get_error_context(*args, **kwargs):
return None
def find_matching_error(*args, **kwargs):
return None
# SQLite database file read by get_event_frequency(); that function returns
# None when this path does not exist.
DB_PATH = Path('/usr/local/share/proxmenux/health_monitor.db')
def _describe_uptime(uptime_seconds: float) -> str:
    """Format an uptime given in seconds, adding a boot-age hint where useful."""
    days = int(uptime_seconds // 86400)
    hours = int((uptime_seconds % 86400) // 3600)
    minutes = int((uptime_seconds % 3600) // 60)

    # Minutes are only shown while the uptime is still below one hour.
    parts = []
    if days > 0:
        parts.append(f"{days} day{'s' if days != 1 else ''}")
    if hours > 0:
        parts.append(f"{hours} hour{'s' if hours != 1 else ''}")
    if not parts:
        parts.append(f"{minutes} minute{'s' if minutes != 1 else ''}")
    uptime_str = ", ".join(parts)

    # Context hint: very young uptimes point at startup problems, very long
    # ones at a system that has been running undisturbed.
    if uptime_seconds < 600:  # Less than 10 minutes
        return f"{uptime_str} (just booted - likely startup issue)"
    if uptime_seconds < 3600:  # Less than 1 hour
        return f"{uptime_str} (recently booted)"
    if days >= 30:
        return f"{uptime_str} (stable system)"
    return uptime_str


def get_system_uptime() -> str:
    """Get system uptime in human-readable format.

    Reads the first field of /proc/uptime (seconds since boot).

    Returns:
        String like "2 minutes (just booted - likely startup issue)",
        "45 minutes (recently booted)", "3 days, 2 hours" or
        "89 days, 4 hours (stable system)". Returns "unknown" when
        /proc/uptime cannot be read or parsed.
    """
    try:
        with open('/proc/uptime', 'r') as f:
            uptime_seconds = float(f.readline().split()[0])
    except (OSError, ValueError, IndexError):
        # Missing/unreadable file, non-numeric content, or an empty line.
        return "unknown"
    return _describe_uptime(uptime_seconds)
def get_event_frequency(error_id: str = None, error_key: str = None,
                        category: str = None, hours: int = 24) -> Optional[Dict[str, Any]]:
    """Get frequency information for an error from the database.

    Exactly one lookup is performed, chosen by the first argument that is
    set: ``error_id`` (matched against both the error_key and error_id
    columns), then ``error_key``, then ``category`` (unresolved errors only).
    The most recently seen matching row is used.

    Args:
        error_id: Specific error ID to look up
        error_key: Alternative error key
        category: Error category
        hours: Accepted for interface compatibility; the queries below do
            not apply a time window, so this value is currently unused.

    Returns:
        Dict with 'occurrences' and 'category', plus 'first_seen_ago' and
        'pattern' when the stored timestamps allow computing them. None when
        the database file is missing, no criterion was given, no row matched,
        or the lookup failed.
    """
    if not DB_PATH.exists():
        return None

    if error_id:
        query = '''
            SELECT first_seen, last_seen, occurrences, category
            FROM errors WHERE error_key = ? OR error_id = ?
            ORDER BY last_seen DESC LIMIT 1
        '''
        params = (error_id, error_id)
    elif error_key:
        query = '''
            SELECT first_seen, last_seen, occurrences, category
            FROM errors WHERE error_key = ?
            ORDER BY last_seen DESC LIMIT 1
        '''
        params = (error_key,)
    elif category:
        query = '''
            SELECT first_seen, last_seen, occurrences, category
            FROM errors WHERE category = ? AND resolved_at IS NULL
            ORDER BY last_seen DESC LIMIT 1
        '''
        params = (category,)
    else:
        return None

    try:
        conn = sqlite3.connect(str(DB_PATH), timeout=5)
        try:
            row = conn.execute(query, params).fetchone()
        finally:
            # Always release the handle, including when the query raises
            # (e.g. missing table/column or a locked database).
            conn.close()

        if not row:
            return None
        first_seen, last_seen, occurrences, cat = row

        try:
            first_dt = datetime.fromisoformat(first_seen) if first_seen else None
            last_dt = datetime.fromisoformat(last_seen) if last_seen else None
            now = datetime.now()

            result = {
                'occurrences': occurrences or 1,
                'category': cat
            }

            if first_dt:
                age = now - first_dt
                if age.total_seconds() < 3600:
                    result['first_seen_ago'] = f"{int(age.total_seconds() / 60)} minutes ago"
                elif age.total_seconds() < 86400:
                    result['first_seen_ago'] = f"{int(age.total_seconds() / 3600)} hours ago"
                else:
                    result['first_seen_ago'] = f"{age.days} days ago"

            if last_dt and first_dt and occurrences and occurrences > 1:
                # Average gap between consecutive occurrences.
                span = (last_dt - first_dt).total_seconds()
                if span > 0:
                    avg_interval = span / (occurrences - 1)
                    if avg_interval < 60:
                        result['pattern'] = f"recurring every ~{int(avg_interval)} seconds"
                    elif avg_interval < 3600:
                        result['pattern'] = f"recurring every ~{int(avg_interval / 60)} minutes"
                    else:
                        result['pattern'] = f"recurring every ~{int(avg_interval / 3600)} hours"

            return result
        except (ValueError, TypeError):
            # Unparseable or incompatible timestamps/counters: fall back to
            # the bare counters rather than dropping the information.
            return {'occurrences': occurrences or 1, 'category': cat}
    except Exception as e:
        # Deliberate best-effort boundary: context enrichment must never
        # break notification delivery.
        print(f"[AIContext] Error getting frequency: {e}")
        return None
# SMART attributes whose non-zero raw value is reported in the summary.
_SMART_CRITICAL_ATTRS = frozenset({
    'Reallocated_Sector_Ct', 'Current_Pending_Sector',
    'Offline_Uncorrectable', 'UDMA_CRC_Error_Count',
    'Reallocated_Event_Count', 'Reported_Uncorrect',
})


def _parse_smart_attributes(smartctl_output: str) -> Dict[str, str]:
    """Extract the raw values of the critical attributes from `smartctl -A` output."""
    attributes = {}
    for line in smartctl_output.splitlines():
        fields = line.split()
        # Row layout: ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED
        # WHEN_FAILED RAW_VALUE. The raw value is the 10th column; anything
        # after it is trailing annotation and must not be mistaken for it.
        if len(fields) >= 10 and fields[1] in _SMART_CRITICAL_ATTRS:
            attributes[fields[1]] = fields[9]
    return attributes


def get_smart_data(disk_device: str) -> Optional[str]:
    """Get SMART health data for a disk.

    Runs `smartctl -H` for the overall verdict and `smartctl -A` for the
    attribute table, each with a 10 second timeout.

    Args:
        disk_device: Device path like /dev/sda or just sda

    Returns:
        "SMART Health: PASSED|FAILED|UNKNOWN", followed by one indented line
        per critical attribute whose raw value is a non-zero integer.
        None when no device was given, the device node does not exist,
        smartctl is not installed, or the call fails or times out.
    """
    if not disk_device:
        return None

    # Normalize device path
    if not disk_device.startswith('/dev/'):
        disk_device = f'/dev/{disk_device}'

    if not os.path.exists(disk_device):
        return None

    try:
        health = subprocess.run(
            ['smartctl', '-H', disk_device],
            capture_output=True, text=True, timeout=10
        )
        attr_table = subprocess.run(
            ['smartctl', '-A', disk_device],
            capture_output=True, text=True, timeout=10
        )
    except (subprocess.SubprocessError, OSError, ValueError):
        # Timeout, smartctl missing/not executable, or an invalid argument.
        return None

    health_status = "UNKNOWN"
    if "PASSED" in health.stdout:
        health_status = "PASSED"
    elif "FAILED" in health.stdout:
        health_status = "FAILED"

    lines = [f"SMART Health: {health_status}"]
    for attr, value in _parse_smart_attributes(attr_table.stdout).items():
        try:
            if int(value) > 0:
                lines.append(f" {attr}: {value}")
        except ValueError:
            # Non-numeric raw value: nothing meaningful to compare, skip it.
            continue

    return "\n".join(lines)
# Tried in order; the first pattern that matches anywhere in the text wins.
# The /dev/-anchored forms accept multi-letter names (sdaa, vdab, ...) so a
# device beyond the 26th is not truncated to a different disk. The bare-word
# forms stay single-letter to avoid matching ordinary words.
_DISK_DEVICE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'/dev/(sd[a-z]+\d*)',
        r'/dev/(nvme\d+n\d+(?:p\d+)?)',
        r'/dev/(hd[a-z]+\d*)',
        r'/dev/(vd[a-z]+\d*)',
        r'\b(sd[a-z])\b',
        r'disk[_\s]+(sd[a-z])',
        r'ata\d+\.\d+: (sd[a-z])',
    )
)


def extract_disk_device(text: str) -> Optional[str]:
    """Extract disk device name from error text.

    Args:
        text: Error message or log content

    Returns:
        Device name without the /dev/ prefix, e.g. 'sda', 'sdb1' or
        'nvme0n1p2' (a partition suffix is kept when present), or None when
        the text is empty or names no recognizable device.
    """
    if not text:
        return None

    for pattern in _DISK_DEVICE_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1)
    return None
# Free-text hints that an event concerns a disk. "ata" is matched as a whole
# word (ata, ata1, sata, ...) so that words such as "data" or "metadata" do
# not mark every event as disk-related.
_DISK_TEXT_HINT_RE = re.compile(r'disk|smart|/dev/sd|i/o error|\bs?ata\d*\b')


def enrich_context_for_ai(
    title: str,
    body: str,
    event_type: str,
    data: Dict[str, Any],
    journal_context: str = '',
    detail_level: str = 'standard'
) -> str:
    """Build enriched context string for AI processing.

    Sections are emitted in this order, each only when it has content,
    separated by blank lines:
    - System uptime
    - Event frequency information
    - SMART data (for disk errors)
    - Known error matching
    - Original journal context

    Args:
        title: Notification title
        body: Notification body
        event_type: Type of event
        data: Event data dict; 'error_key' / 'error_id' and 'category' are
            read from it when present
        journal_context: Original journal log context
        detail_level: Level of detail (minimal, standard, detailed)

    Returns:
        Enriched context string, or "" when nothing could be gathered
    """
    data = data or {}
    context_parts = []
    combined_text = f"{title} {body} {journal_context}"

    # 1. System uptime (always useful)
    uptime = get_system_uptime()
    if uptime and uptime != "unknown":
        context_parts.append(f"System uptime: {uptime}")

    # 2. Event frequency
    error_key = data.get('error_key') or data.get('error_id')
    category = data.get('category')
    freq = get_event_frequency(error_id=error_key, category=category)
    if freq:
        freq_line = f"Event frequency: {freq.get('occurrences', 1)} occurrence(s)"
        if freq.get('first_seen_ago'):
            freq_line += f", first seen {freq['first_seen_ago']}"
        if freq.get('pattern'):
            freq_line += f", {freq['pattern']}"
        context_parts.append(freq_line)

    # 3. SMART data for disk-related events: decided by the event type
    # first, then by keywords in the combined text.
    event_type_lc = (event_type or '').lower()
    disk_related = any(x in event_type_lc for x in ['disk', 'smart', 'storage', 'io_error'])
    if not disk_related:
        disk_related = bool(_DISK_TEXT_HINT_RE.search(combined_text.lower()))

    if disk_related:
        disk_device = extract_disk_device(combined_text)
        if disk_device:
            smart_data = get_smart_data(disk_device)
            if smart_data:
                context_parts.append(smart_data)

    # 4. Known error matching
    known_error_ctx = get_error_context(combined_text, category=category, detail_level=detail_level)
    if known_error_ctx:
        context_parts.append(known_error_ctx)

    # 5. Add original journal context
    if journal_context:
        context_parts.append(f"Journal logs:\n{journal_context}")

    if context_parts:
        return "\n\n".join(context_parts)
    return journal_context or ""
def get_enriched_context(
    event: 'NotificationEvent',
    detail_level: str = 'standard'
) -> str:
    """Build the enriched AI context for a NotificationEvent.

    Unpacks the fields that enrich_context_for_ai() needs from the event:
    title, body (falling back to 'message') and '_journal_context' are read
    from ``event.data``, the event type from ``event.event_type``.

    Args:
        event: NotificationEvent object
        detail_level: Level of detail

    Returns:
        Enriched context string
    """
    payload = event.data
    fallback_body = payload.get('message', '')
    return enrich_context_for_ai(
        title=payload.get('title', ''),
        body=payload.get('body', fallback_body),
        event_type=event.event_type,
        data=payload,
        journal_context=payload.get('_journal_context', ''),
        detail_level=detail_level
    )
+2
View File
@@ -95,6 +95,8 @@ cp "$SCRIPT_DIR/notification_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo
# Copy the notification helper modules into $APP_DIR/usr/bin/.
# cp's stderr is discarded, so ANY copy failure (not only a missing source
# file) prints the "not found" warning; the echo also makes each line
# succeed, so a failed copy does not stop the script here.
cp "$SCRIPT_DIR/notification_channels.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_channels.py not found"
cp "$SCRIPT_DIR/notification_templates.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_templates.py not found"
cp "$SCRIPT_DIR/notification_events.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ notification_events.py not found"
cp "$SCRIPT_DIR/proxmox_known_errors.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ proxmox_known_errors.py not found"
cp "$SCRIPT_DIR/ai_context_enrichment.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ ai_context_enrichment.py not found"
cp "$SCRIPT_DIR/startup_grace.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ startup_grace.py not found"
cp "$SCRIPT_DIR/flask_notification_routes.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ flask_notification_routes.py not found"
cp "$SCRIPT_DIR/oci_manager.py" "$APP_DIR/usr/bin/" 2>/dev/null || echo "⚠️ oci_manager.py not found"
+301
View File
@@ -862,6 +862,307 @@ class HealthPersistence:
conn.commit()
conn.close()
# Clean up errors for resources that no longer exist (VMs/CTs deleted, disks removed)
self._cleanup_stale_resources()
def _cleanup_stale_resources(self):
    """Auto-resolve active errors whose resource is gone or which went stale.

    Every unresolved row of the errors table is checked against the first
    rule below that applies to it:
    - VMs/CTs ('vms' category or vm_*/ct_* keys): the VM/CT no longer exists
    - Disks/storage: the ZFS pool, mount point or block device named in the
      error key no longer exists
    - Network: the interface named in the error key no longer exists
    - Services/pve_services: the referenced CT was deleted, or (pve_services
      only) the service unit is not listed by systemctl
    - Logs: first seen more than 48h ago
    - Cluster/corosync/qdevice/quorum keys: the node is not in a cluster
    - Temperature: not seen for more than 24h
    - Updates/security: first seen more than 7 days ago
    Independently of the above, any error first seen AND last seen more than
    7 days ago is resolved.

    Existence checks are conservative: when the system cannot be queried
    (command missing, timeout, psutil unavailable) the resource is assumed
    to still exist. Matching rows get resolved_at set to the current time
    and resolution_type 'auto'; the reason text is not stored.
    """
    import subprocess
    import re

    vmid_re = re.compile(r'(?:VM|CT|VMID|CTID|vm_|ct_)[\s_]?(\d{3,})', re.IGNORECASE)
    zfs_key_re = re.compile(r'zfs_(?:pool_)?([a-zA-Z0-9_-]+)')
    service_key_re = re.compile(r'service_([a-zA-Z0-9_-]+)')
    iface_re = re.compile(r'((?:vmbr|bond|eth|eno|ens|enp)[a-zA-Z0-9]+)')
    # Only tokens that look like block-device names are checked under /dev.
    # A generic "[a-z]{2,4}" would turn keys such as "disk_fs_/mnt/data" or
    # "disk_space_..." into /dev/fs or /dev/spac and resolve live errors.
    disk_key_re = re.compile(
        r'(?:disk_|smart_|io_error_)'
        r'((?:sd|hd|vd|xvd)[a-z]{1,3}\d*|nvme\d+(?:n\d+)?(?:p\d+)?|mmcblk\d+(?:p\d+)?|md\d+)'
        r'(?![a-zA-Z0-9])'
    )
    log_key_prefixes = ('log_persistent_', 'log_spike_', 'log_cascade_', 'log_critical_')
    cluster_keywords = ('cluster', 'corosync', 'qdevice', 'quorum')

    now = datetime.now()
    now_iso = now.isoformat()

    # Cache for expensive checks (avoid repeated subprocess calls).
    cache = {}

    def cached(key, loader):
        if key not in cache:
            cache[key] = loader()
        return cache[key]

    def names_known(token, known_names):
        """True if token is a known name, optionally followed by '_<suffix>'.

        Keys carry a status suffix after the resource name (for example
        "rpool_degraded"), and names may themselves contain underscores, so
        the token cannot be split reliably. The price is that a removed
        resource whose name extends a surviving one ("rpool_old" next to
        "rpool") is only caught by the 7-day fallback.
        """
        return any(
            token == name or token.startswith(name + '_')
            for name in known_names if name
        )

    def check_vm_ct_cached(vmid):
        return cached(('vmct', vmid), lambda: self._check_vm_ct_exists(vmid))

    def load_cluster_status():
        try:
            result = subprocess.run(
                ['pvecm', 'status'],
                capture_output=True, text=True, timeout=5
            )
        except Exception:
            # Assume cluster on error so nothing is resolved by mistake.
            return {'is_cluster': True, 'nodes': ''}
        succeeded = result.returncode == 0
        return {
            'is_cluster': succeeded and 'Cluster information' in result.stdout,
            'nodes': result.stdout if succeeded else ''
        }

    def load_network_interfaces():
        # Empty set means "unknown"; callers skip the check in that case.
        try:
            import psutil
            return set(psutil.net_if_stats().keys())
        except Exception:
            return set()

    def load_zfs_pools():
        # None means "unknown"; an empty set means zpool ran and listed no pools.
        try:
            result = subprocess.run(
                ['zpool', 'list', '-H', '-o', 'name'],
                capture_output=True, text=True, timeout=5
            )
        except Exception:
            return None
        if result.returncode != 0:
            return None
        return {line.strip() for line in result.stdout.splitlines() if line.strip()}

    def load_mount_points():
        # Empty set means "unknown"; callers skip the check in that case.
        try:
            import psutil
            return set(p.mountpoint for p in psutil.disk_partitions(all=True))
        except Exception:
            return set()

    def load_service_units():
        # Maps unit name (without ".service") to whether its ACTIVE column is
        # "active". Empty dict means "unknown".
        units = {}
        try:
            result = subprocess.run(
                ['systemctl', 'list-units', '--type=service', '--all', '--no-legend'],
                capture_output=True, text=True, timeout=10
            )
        except Exception:
            return units
        if result.returncode != 0:
            return units
        for line in result.stdout.splitlines():
            fields = line.split()
            # Failed units may be prefixed with a status bullet, so the unit
            # name is the first of the leading two fields ending in ".service".
            for idx, field in enumerate(fields[:2]):
                if field.endswith('.service'):
                    # Columns after the unit name: LOAD ACTIVE SUB DESCRIPTION
                    is_active = len(fields) > idx + 2 and fields[idx + 2] == 'active'
                    units[field[:-len('.service')]] = is_active
                    break
        return units

    def extract_vmid_from_text(text):
        """Extract VM/CT ID from error message or key."""
        if not text:
            return None
        # Patterns: "VM 100", "CT 100", "vm_100_", "ct_100_", "VMID 100", etc.
        match = vmid_re.search(text)
        return match.group(1) if match else None

    def get_age_hours(timestamp_str):
        """Get age in hours from ISO timestamp string (0 if missing/unparseable)."""
        if not timestamp_str:
            return 0
        try:
            return (now - datetime.fromisoformat(timestamp_str)).total_seconds() / 3600
        except (ValueError, TypeError):
            return 0

    def disk_gone_reason(error_key):
        # ZFS pool errors (e.g., "zfs_pool_rpool_degraded")
        zfs_match = zfs_key_re.search(error_key)
        if zfs_match:
            pools = cached('zfs', load_zfs_pools)
            if pools is not None and not names_known(zfs_match.group(1), pools):
                return 'ZFS pool removed'
        # Disk device errors (e.g., "disk_sdh_io_error", "smart_sda_failing")
        disk_match = disk_key_re.search(error_key)
        if disk_match and not os.path.exists(f'/dev/{disk_match.group(1)}'):
            return 'Disk device removed'
        # Mount point errors (e.g., "disk_fs_/mnt/data")
        if 'disk_fs_' in error_key:
            target = error_key.split('disk_fs_', 1)[1]
            if target.startswith('/'):
                mounts = cached('mounts', load_mount_points)
                if mounts and not names_known(target, mounts):
                    return 'Mount point removed'
        return None

    def resource_gone_reason(error_key, category, message, age_hours, last_seen_hours):
        """Return why the error can be resolved, or None. First matching rule only."""
        key = error_key or ''

        # === VM/CT ERRORS ===
        if category == 'vms' or key.startswith(('vm_', 'ct_')):
            vmid = extract_vmid_from_text(error_key) or extract_vmid_from_text(message)
            if vmid and not check_vm_ct_cached(vmid):
                return 'VM/CT deleted'
            return None

        # === DISK ERRORS ===
        if category in ('disks', 'storage'):
            return disk_gone_reason(key) if key else None

        # === NETWORK ERRORS ===
        if category == 'network':
            # e.g. "net_vmbr1_down" -> "vmbr1", "bond0_slave_error" -> "bond0"
            iface_match = iface_re.search(key)
            if iface_match:
                interfaces = cached('ifaces', load_network_interfaces)
                if interfaces and iface_match.group(1) not in interfaces:
                    return 'Network interface removed'
            return None

        # === SERVICE ERRORS ===
        if category in ('services', 'pve_services'):
            # First check if it references a CT that no longer exists
            vmid = extract_vmid_from_text(message) or extract_vmid_from_text(error_key)
            if vmid and not check_vm_ct_cached(vmid):
                return 'Container deleted'
            # For pve_services, check if the service unit is known to systemd
            if category == 'pve_services' and key:
                service_match = service_key_re.search(key)
                if service_match:
                    services = cached('services', load_service_units)
                    if services and not names_known(service_match.group(1), services):
                        return 'Service no longer exists'
            return None

        # === LOG ERRORS === (point-in-time issues, aged out after 48h)
        if category == 'logs' or key.startswith(log_key_prefixes):
            return 'Log error aged out (>48h)' if age_hours > 48 else None

        # === CLUSTER ERRORS ===
        if key and any(word in key.lower() for word in cluster_keywords):
            if not cached('cluster', load_cluster_status)['is_cluster']:
                return 'No longer in cluster'
            return None

        # === TEMPERATURE ERRORS === (resolved after 24h without activity)
        if category == 'temperature':
            if last_seen_hours > 24:
                return 'Temperature error stale (>24h no activity)'
            return None

        # === UPDATES/SECURITY ERRORS === (informational, aged out after 7 days)
        if category in ('updates', 'security'):
            if age_hours > 168:
                return 'Update/security notice aged out (>7d)'
            return None

        return None

    conn = self._get_conn()
    try:
        cursor = conn.cursor()

        # Get all active (unresolved) errors with first_seen and last_seen for age checks
        cursor.execute('''
            SELECT id, error_key, category, message, first_seen, last_seen, severity FROM errors
            WHERE resolved_at IS NULL
        ''')
        active_errors = cursor.fetchall()

        resolved_count = 0
        for err_id, error_key, category, message, first_seen, last_seen, _severity in active_errors:
            age_hours = get_age_hours(first_seen)
            last_seen_hours = get_age_hours(last_seen)

            resolution_reason = resource_gone_reason(
                error_key, category, message, age_hours, last_seen_hours
            )

            # === FALLBACK: ANY STALE ERROR ===
            # Any error that hasn't been seen in 7 days and is older than 7 days
            if resolution_reason is None and age_hours > 168 and last_seen_hours > 168:
                resolution_reason = 'Stale error (no activity >7d)'

            if resolution_reason is not None:
                cursor.execute('''
                    UPDATE errors SET resolved_at = ?, resolution_type = 'auto'
                    WHERE id = ?
                ''', (now_iso, err_id))
                resolved_count += 1

        if resolved_count > 0:
            conn.commit()
            print(f"[HealthPersistence] Auto-resolved {resolved_count} errors for stale/deleted resources")
    finally:
        # Release the connection even if a query or a check raised.
        conn.close()
def _check_vm_ct_exists(self, vmid: str) -> bool:
    """Report whether a VM or CT with this ID exists, running or not.

    Probes 'qm config <vmid>' first and 'pct config <vmid>' second; an exit
    status of 0 from either one counts as "exists". Each probe is limited
    to 3 seconds.

    Returns:
        True if either probe succeeds, or if probing itself raises (the
        resource is then assumed to exist to avoid false positives).
        False only when both probes ran and both returned non-zero.
    """
    import subprocess

    try:
        # VM tool first, then the container tool.
        for tool in ('qm', 'pct'):
            probe = subprocess.run(
                [tool, 'config', vmid],
                capture_output=True,
                text=True,
                timeout=3
            )
            if probe.returncode == 0:
                return True
    except Exception:
        # On error, assume it exists to avoid false positives
        return True
    return False
def check_vm_running(self, vm_id: str) -> bool:
"""
+2 -2
View File
@@ -28,7 +28,7 @@ from pathlib import Path
# ─── Shared State for Cross-Watcher Coordination ──────────────────
# ─── Startup Grace Period ───────────────────────────────────────────────────
# ─── Startup Grace Period ───────────────────────────────────────────────────
# Import centralized startup grace management
# This provides a single source of truth for all grace period logic
import startup_grace
@@ -2610,7 +2610,7 @@ class PollingCollector:
pass
# ─── Proxmox Webhook Receiver ──────────────────────────────────
# ─── Proxmox Webhook Receiver ──────────────────────────────────
class ProxmoxHookWatcher:
"""Receives native Proxmox VE notifications via local webhook endpoint.
+33 -15
View File
@@ -44,6 +44,13 @@ from notification_events import (
ProxmoxHookWatcher,
)
# AI context enrichment (uptime, frequency, SMART data, known errors)
try:
from ai_context_enrichment import enrich_context_for_ai
except ImportError:
def enrich_context_for_ai(title, body, event_type, data, journal_context='', detail_level='standard'):
return journal_context
# ─── Constants ────────────────────────────────────────────────────
@@ -743,10 +750,10 @@ class NotificationManager:
'ai_custom_prompt': self._config.get('ai_custom_prompt', ''),
}
# Get journal context if available
journal_context = data.get('_journal_context', '')
for ch_name, channel in channels.items():
# Get journal context if available (will be enriched per-channel based on detail_level)
raw_journal_context = data.get('_journal_context', '')
for ch_name, channel in channels.items():
# ── Per-channel category check ──
# Default: category enabled (true) unless explicitly disabled.
ch_group_key = f'{ch_name}.events.{event_group}'
@@ -771,17 +778,28 @@ class NotificationManager:
rich_key = f'{ch_name}.rich_format'
use_rich_format = self._config.get(rich_key, 'false') == 'true'
# ── Per-channel AI enhancement ──
# Apply AI with channel-specific detail level and emoji setting
# If AI is enabled AND rich_format is on, AI will include emojis directly
# Pass channel_type so AI knows whether to append original (email only)
channel_ai_config = {**ai_config, 'channel_type': ch_name}
ai_result = format_with_ai_full(
ch_title, ch_body, severity, channel_ai_config,
detail_level=detail_level,
journal_context=journal_context,
use_emojis=use_rich_format
)
# ── Per-channel AI enhancement ──
# Apply AI with channel-specific detail level and emoji setting
# If AI is enabled AND rich_format is on, AI will include emojis directly
# Pass channel_type so AI knows whether to append original (email only)
channel_ai_config = {**ai_config, 'channel_type': ch_name}
# Enrich context with uptime, frequency, SMART data, and known errors
enriched_context = enrich_context_for_ai(
title=ch_title,
body=ch_body,
event_type=event_type,
data=data,
journal_context=raw_journal_context,
detail_level=detail_level
)
ai_result = format_with_ai_full(
ch_title, ch_body, severity, channel_ai_config,
detail_level=detail_level,
journal_context=enriched_context,
use_emojis=use_rich_format
)
ch_title = ai_result.get('title', ch_title)
ch_body = ai_result.get('body', ch_body)
+45 -18
View File
@@ -1384,7 +1384,13 @@ AI_DETAIL_TOKENS = {
# System prompt template - optimized hybrid version
AI_SYSTEM_PROMPT = """You are a notification FORMATTER for ProxMenux Monitor (Proxmox VE).
Your job: translate and reformat alerts into {language}. You are NOT an analyst do not interpret or diagnose.
Your job: translate alerts into {language} and enrich them with context when provided.
ABSOLUTE CONSTRAINTS (NO EXCEPTIONS)
- NO HALLUCINATIONS: Do not invent causes, solutions, or facts not present in the provided data
- NO SPECULATION: If something is unclear, state what IS known, not what MIGHT be
- NO CONVERSATIONAL TEXT: Never write "Here is...", "I've translated...", "Let me explain..."
- ONLY use information from: the message, journal context, and known error database (if provided)
WHAT TO TRANSLATE
Translate: labels, descriptions, status words, units (GBGo in French, etc.)
@@ -1394,15 +1400,37 @@ DO NOT translate: hostnames, IPs, paths, VM/CT IDs, device names (/dev/sdX), tec
1. Plain text only NO markdown, no **bold**, no `code`, no bullet lists (use "" for packages only)
2. Preserve severity: "failed" stays "failed", "warning" stays "warning" never soften errors
3. Preserve structure: keep same fields and line order, only translate content
4. Detail level "{detail_level}": brief (2-3 lines) | standard (short paragraph) | detailed (full report)
4. Detail level "{detail_level}":
- brief: 1-2 lines, essential facts only
- standard: short paragraph, key details and context
- detailed: full report with all available information, step-by-step if applicable
5. DEDUPLICATION: merge duplicate facts from multiple sources into one clear statement
6. EMPTY LISTS: write translated "none" after label, never leave blank
7. Keep "hostname:" prefix in title translate only the descriptive part
8. DO NOT add recommendations or suggestions ("you should...", "try...", "consider...")
{suggestions_addon}9. Present facts from message AND journal context describe what happened, do NOT speculate
10. OUTPUT ONLY the final result no "Original:", no before/after comparisons
11. Unknown input: preserve as closely as possible, translate what you can
8. DO NOT add recommendations or suggestions UNLESS AI Suggestions mode is enabled below
9. ENRICHED CONTEXT: You may receive additional context data including:
- "System uptime: X days (stable system)" helps distinguish startup issues from runtime failures
- "Event frequency: N occurrences, first seen X ago" indicates recurring vs one-time issues
- "SMART Health: PASSED/FAILED" with disk attributes critical for disk errors
- "KNOWN PROXMOX ERROR DETECTED" with cause/solution YOU MUST USE this exact information
How to use enriched context:
- If uptime is <10min and error is service-related mention "occurred shortly after boot"
- If frequency shows recurring pattern mention "recurring issue (N times in X hours)"
- If SMART shows FAILED treat as CRITICAL: "Disk failing - immediate attention required"
- If KNOWN ERROR is provided YOU MUST incorporate its Cause and Solution (translate, don't copy verbatim)
10. JOURNAL CONTEXT EXTRACTION: When journal logs are provided:
- Extract specific IDs (VM/CT numbers, disk devices, service names)
- Include relevant timestamps if they help explain the timeline
- Identify root cause when logs clearly show it (e.g., "exit-code 255" -> "process crashed")
- Translate technical terms: "Emask 0x10" -> "ATA bus error", "DRDY ERR" -> "drive not ready"
- If logs show the same error repeating, state frequency: "occurred 15 times in 10 minutes"
- IGNORE journal entries unrelated to the main event
11. OUTPUT ONLY the final result no "Original:", no before/after comparisons
12. Unknown input: preserve as closely as possible, translate what you can
13. REDUNDANCY: Never repeat the same information twice. If title says "CT 103 failed", body should not start with "Container 103 failed"
{suggestions_addon}
PROXMOX MAPPINGS (use directly, never explain)
pve-container@XXXX "CT XXXX" | qemu-server@XXXX "VM XXXX" | vzdump "backup"
pveproxy/pvedaemon/pvestatd "Proxmox service" | corosync "cluster service"
@@ -1457,18 +1485,17 @@ CORRECT (markers are separators only):
# Addon for experimental suggestions mode
AI_SUGGESTIONS_ADDON = """
EXCEPTION TO RULE 8 (Suggestions enabled): When journal context shows a clear, actionable problem,
you MAY add ONE brief suggestion at the END of the body (after all facts), using this format:
💡 Tip: [your suggestion here]
Guidelines for suggestions:
- Only suggest when the problem AND solution are clear from the logs
- Keep it to ONE line, max 100 characters
- Be specific: "Check disk /dev/sdb SMART status" not "Check your disks"
- Use commands when helpful: "Run 'systemctl restart pvedaemon'"
- Never speculate - only suggest based on evidence in the logs
- Skip the tip entirely if the problem is unclear or already resolved
AI SUGGESTIONS MODE (ENABLED)
You MAY add ONE brief, actionable tip at the END of the body using this exact format:
💡 Tip: [your concise suggestion here]
Rules for the tip:
- ONLY include if the log context or Known Error database clearly points to a specific fix
- Keep under 100 characters
- Be specific: "Run 'pvecm status' to check quorum" NOT "Check cluster status"
- If Known Error provides a solution, YOU MUST USE IT (don't invent your own)
- Never guess skip the tip if the cause/solution is unclear
"""
# Emoji instructions injected into AI_SYSTEM_PROMPT for rich channels (Telegram, Discord, Pushover)
+348
View File
@@ -0,0 +1,348 @@
#!/usr/bin/env python3
"""
Database of known Proxmox/Linux errors with causes, solutions, and severity levels.
This provides the AI with accurate, pre-verified information about common errors,
reducing hallucinations and ensuring consistent, helpful responses.
Each entry includes:
- pattern: regex pattern to match against error messages/logs
- cause: brief explanation of what causes this error
- cause_detailed: more comprehensive explanation for detailed mode
- severity: info, warning, critical
- solution: brief actionable solution
- solution_detailed: step-by-step solution for detailed mode
- url: optional documentation link
- category: category label for the entry (e.g. "updates", "cluster")
"""
import re
from typing import Optional, Dict, Any, List
# Known error patterns with causes and solutions.
#
# Each entry is a dict with the keys:
#   pattern            regex searched (case-insensitively) in the error text
#   cause / cause_detailed        short and long explanation of the cause
#   severity           "info", "warning" or "critical"
#   solution / solution_detailed  short and step-by-step remediation
#   url                optional documentation link
#   category           grouping label used to filter lookups
#
# Order matters: find_matching_error() returns the FIRST entry whose pattern
# matches, so specific patterns must stay ahead of generic ones.
PROXMOX_KNOWN_ERRORS: List[Dict[str, Any]] = [
    # ==================== SUBSCRIPTION/LICENSE ====================
    {
        "pattern": r"no valid subscription|subscription.*invalid|not subscribed",
        "cause": "Proxmox enterprise repository requires paid subscription",
        "cause_detailed": "Proxmox VE uses a subscription model for enterprise features. Without a valid subscription key, access to the enterprise repository is denied. This is normal for home/lab users.",
        "severity": "info",
        "solution": "Use no-subscription repository or purchase subscription",
        "solution_detailed": "For home/lab use: Switch to the no-subscription repository by editing /etc/apt/sources.list.d/pve-enterprise.list. For production: Purchase a subscription at proxmox.com/pricing",
        "url": "https://pve.proxmox.com/wiki/Package_Repositories",
        "category": "updates"
    },
    # ==================== CLUSTER/COROSYNC ====================
    {
        "pattern": r"quorum.*lost|lost.*quorum|not.*quorate",
        "cause": "Cluster lost majority of voting nodes",
        "cause_detailed": "Corosync cluster requires more than 50% of configured votes to maintain quorum. When quorum is lost, the cluster becomes read-only to prevent split-brain scenarios.",
        "severity": "critical",
        "solution": "Check network connectivity between nodes; ensure majority of nodes are online",
        "solution_detailed": "1. Verify network connectivity: ping all cluster nodes\n2. Check corosync status: systemctl status corosync\n3. View cluster status: pvecm status\n4. If nodes are unreachable, check firewall rules (ports 5405-5412 UDP)\n5. For emergency single-node operation: pvecm expected 1",
        "url": "https://pve.proxmox.com/wiki/Cluster_Manager",
        "category": "cluster"
    },
    {
        "pattern": r"corosync.*qdevice.*error|qdevice.*connection.*failed|qdevice.*not.*connected",
        "cause": "QDevice helper node is unreachable",
        "cause_detailed": "The Corosync QDevice provides an additional vote for 2-node clusters. When it cannot connect, the cluster may lose quorum if one node fails.",
        "severity": "warning",
        "solution": "Check QDevice server connectivity and corosync-qnetd service",
        "solution_detailed": "1. Verify QDevice server is running: systemctl status corosync-qnetd (on QDevice host)\n2. Check connectivity: nc -zv <qdevice-ip> 5403\n3. Restart qdevice: systemctl restart corosync-qdevice\n4. Check certificates: corosync-qdevice-net-certutil -s",
        "url": "https://pve.proxmox.com/wiki/Cluster_Manager#_corosync_external_vote_support",
        "category": "cluster"
    },
    {
        "pattern": r"corosync.*retransmit|corosync.*token.*timeout|ring.*mark.*faulty",
        "cause": "Network latency or packet loss between cluster nodes",
        "cause_detailed": "Corosync uses multicast/unicast for cluster communication. High latency, packet loss, or network congestion causes token timeouts and retransmissions, potentially leading to node eviction.",
        "severity": "warning",
        "solution": "Check network quality between nodes; consider increasing token timeout",
        "solution_detailed": "1. Test network latency: ping -c 100 <other-node>\n2. Check for packet loss between nodes\n3. Verify MTU settings match on all interfaces\n4. Increase token timeout in /etc/pve/corosync.conf if needed (default 1000ms)\n5. Check switch/router for congestion",
        "category": "cluster"
    },
    # ==================== DISK/STORAGE ====================
    {
        # "Pre-fail"/"Old_age" are the attribute TYPE labels smartctl prints
        # for every attribute, healthy or not; they only signal a failure
        # when the same line also carries a FAILING marker (FAILING_NOW).
        "pattern": r"SMART.*FAILED|smart.*failed.*health|Pre-fail.*FAILING|Old_age.*FAILING",
        "cause": "Disk SMART health check failed - disk is failing",
        "cause_detailed": "SMART (Self-Monitoring, Analysis and Reporting Technology) detected critical disk health issues. The disk is likely failing and data loss is imminent.",
        "severity": "critical",
        "solution": "IMMEDIATELY backup data and replace disk",
        "solution_detailed": "1. URGENT: Backup all data from this disk immediately\n2. Check SMART details: smartctl -a /dev/sdX\n3. Note the failing attributes (Reallocated_Sector_Ct, Current_Pending_Sector, etc.)\n4. Plan disk replacement\n5. If in RAID/ZFS: initiate disk replacement procedure",
        "category": "disks"
    },
    {
        "pattern": r"Reallocated_Sector_Ct.*threshold|reallocated.*sectors?.*exceeded",
        "cause": "Disk has excessive bad sectors being remapped",
        "cause_detailed": "The disk firmware has remapped multiple bad sectors to spare areas. While the disk is still functioning, this indicates physical degradation and eventual failure.",
        "severity": "warning",
        "solution": "Monitor closely and plan disk replacement",
        "solution_detailed": "1. Check current value: smartctl -A /dev/sdX | grep Reallocated\n2. If value is increasing, plan immediate replacement\n3. Backup important data\n4. Run extended SMART test: smartctl -t long /dev/sdX",
        "category": "disks"
    },
    {
        # Word boundaries keep "ata"/"unc" from matching inside unrelated
        # words such as "metadata", "data" or "function" (matching is
        # case-insensitive). "ata", "ata1", "ata1.00", "SATA" and "PATA"
        # prefixes are all still recognised.
        "pattern": r"\b[sp]?ata\d*\b.*error|Emask.*0x|DRDY.*ERR|\bUNC\b.*error",
        "cause": "ATA communication error with disk",
        "cause_detailed": "The SATA/ATA controller encountered communication errors with the disk. This can indicate cable issues, controller problems, or disk failure.",
        "severity": "warning",
        "solution": "Check SATA cables and connections; verify disk health with smartctl",
        "solution_detailed": "1. Check SMART health: smartctl -H /dev/sdX\n2. Inspect and reseat SATA cables\n3. Try different SATA port\n4. Check dmesg for pattern of errors\n5. If errors persist, disk may be failing",
        "category": "disks"
    },
    {
        "pattern": r"I/O.*error|blk_update_request.*error|Buffer I/O error",
        "cause": "Disk I/O operation failed",
        "cause_detailed": "The kernel failed to read or write data to the disk. This can be caused by disk failure, cable issues, or filesystem corruption.",
        "severity": "critical",
        "solution": "Check disk health and connections immediately",
        "solution_detailed": "1. Check SMART status: smartctl -H /dev/sdX\n2. Check dmesg for related errors: dmesg | grep -i error\n3. Verify disk is still accessible: lsblk\n4. If ZFS: check pool status with zpool status\n5. Consider filesystem check if safe to unmount",
        "category": "disks"
    },
    {
        "pattern": r"zfs.*pool.*DEGRADED|pool.*is.*degraded",
        "cause": "ZFS pool has reduced redundancy",
        "cause_detailed": "One or more devices in the ZFS pool are unavailable or experiencing errors. The pool is still functional but without full redundancy.",
        "severity": "warning",
        "solution": "Identify failed device with 'zpool status' and replace",
        "solution_detailed": "1. Check pool status: zpool status <pool>\n2. Identify the DEGRADED or UNAVAIL device\n3. If device is present but erroring: zpool scrub <pool>\n4. To replace: zpool replace <pool> <old-device> <new-device>\n5. Monitor resilver progress: zpool status",
        "category": "storage"
    },
    {
        "pattern": r"zfs.*pool.*FAULTED|pool.*is.*faulted",
        "cause": "ZFS pool is inaccessible",
        "cause_detailed": "The ZFS pool has lost too many devices and cannot maintain data integrity. Data may be inaccessible.",
        "severity": "critical",
        "solution": "Check failed devices; may need data recovery",
        "solution_detailed": "1. Check status: zpool status <pool>\n2. Identify all failed devices\n3. Attempt to online devices: zpool online <pool> <device>\n4. If drives are physically present, try zpool clear <pool>\n5. May require data recovery if multiple drives failed",
        "category": "storage"
    },
    # ==================== CEPH ====================
    {
        "pattern": r"ceph.*OSD.*down|osd\.\d+.*down|ceph.*osd.*failed",
        "cause": "Ceph OSD daemon is not running",
        "cause_detailed": "A Ceph Object Storage Daemon (OSD) has stopped or crashed. This reduces storage redundancy and may trigger data rebalancing.",
        "severity": "warning",
        "solution": "Check disk health and restart OSD service",
        "solution_detailed": "1. Check OSD status: ceph osd tree\n2. View OSD logs: journalctl -u ceph-osd@<id>\n3. Check underlying disk: smartctl -H /dev/sdX\n4. Restart OSD: systemctl start ceph-osd@<id>\n5. If OSD keeps crashing, check for disk failure",
        "category": "storage"
    },
    {
        "pattern": r"ceph.*health.*WARN|HEALTH_WARN",
        "cause": "Ceph cluster has warnings",
        "cause_detailed": "Ceph detected issues that don't prevent operation but should be addressed. Common causes: degraded PGs, clock skew, full OSDs.",
        "severity": "warning",
        "solution": "Run 'ceph health detail' for specific issues",
        "solution_detailed": "1. Get details: ceph health detail\n2. Common fixes:\n - Degraded PGs: wait for recovery or add capacity\n - Clock skew: sync NTP on all nodes\n - Full OSDs: add storage or delete data\n3. Check: ceph status",
        "category": "storage"
    },
    {
        "pattern": r"ceph.*health.*ERR|HEALTH_ERR",
        "cause": "Ceph cluster has critical errors",
        "cause_detailed": "Ceph has detected critical issues that may affect data availability or integrity. Immediate attention required.",
        "severity": "critical",
        "solution": "Run 'ceph health detail' and address errors immediately",
        "solution_detailed": "1. Get details: ceph health detail\n2. Check OSD status: ceph osd tree\n3. Check MON status: ceph mon stat\n4. View PG status: ceph pg stat\n5. Address each error shown in health detail",
        "category": "storage"
    },
    # ==================== VM/CT ERRORS ====================
    {
        "pattern": r"TASK ERROR.*failed to get exclusive lock|lock.*timeout|couldn't acquire lock",
        "cause": "Resource is locked by another operation",
        "cause_detailed": "Another task is currently holding a lock on this VM/CT. This prevents concurrent modifications that could cause corruption.",
        "severity": "info",
        "solution": "Wait for other task to complete or check for stuck tasks",
        "solution_detailed": "1. Check running tasks: cat /var/log/pve/tasks/active\n2. Wait for task completion\n3. If task is stuck (>1h), check process: ps aux | grep <vmid>\n4. As last resort, remove lock file: rm /var/lock/qemu-server/lock-<vmid>.conf",
        "category": "vms"
    },
    {
        "pattern": r"kvm.*not.*available|kvm.*disabled|hardware.*virtualization.*disabled",
        "cause": "KVM/hardware virtualization not available",
        "cause_detailed": "The CPU's hardware virtualization extensions (Intel VT-x or AMD-V) are either not supported, not enabled in BIOS, or blocked by another hypervisor.",
        "severity": "warning",
        "solution": "Enable VT-x/AMD-V in BIOS settings",
        "solution_detailed": "1. Reboot into BIOS/UEFI\n2. Find Virtualization settings (often in CPU or Advanced section)\n3. Enable Intel VT-x or AMD-V/SVM\n4. Save and reboot\n5. Verify: grep -E 'vmx|svm' /proc/cpuinfo",
        "category": "vms"
    },
    {
        "pattern": r"out of memory|OOM.*kill|cannot allocate memory|memory.*exhausted",
        "cause": "System or VM ran out of memory",
        "cause_detailed": "The Linux OOM (Out Of Memory) killer terminated a process to free memory. This indicates memory pressure from overcommitment or memory leaks.",
        "severity": "critical",
        "solution": "Increase memory allocation or reduce VM memory usage",
        "solution_detailed": "1. Check what was killed: dmesg | grep -i oom\n2. Review memory usage: free -h\n3. Check balloon driver status for VMs\n4. Consider adding swap or RAM\n5. Review VM memory allocations for overcommitment",
        "category": "memory"
    },
    # ==================== NETWORK ====================
    {
        "pattern": r"bond.*slave.*link.*down|bond.*no.*active.*slave",
        "cause": "Network bond lost a slave interface",
        "cause_detailed": "One or more physical interfaces in a network bond have lost link. Depending on bond mode, this may reduce bandwidth or affect failover.",
        "severity": "warning",
        "solution": "Check physical cable connections and switch ports",
        "solution_detailed": "1. Check bond status: cat /proc/net/bonding/bond0\n2. Identify down slave interface\n3. Check physical cable connection\n4. Check switch port status and errors\n5. Verify interface: ethtool <slave-iface>",
        "category": "network"
    },
    {
        "pattern": r"link.*not.*ready|carrier.*lost|link.*down|NIC.*Link.*Down",
        "cause": "Network interface lost link",
        "cause_detailed": "The physical or virtual network interface has lost its connection. This could be a cable issue, switch problem, or driver issue.",
        "severity": "warning",
        "solution": "Check cable, switch port, and interface status",
        "solution_detailed": "1. Check interface: ip link show <iface>\n2. Check cable connection\n3. Check switch port LEDs\n4. Try: ip link set <iface> down && ip link set <iface> up\n5. Check driver: ethtool -i <iface>",
        "category": "network"
    },
    {
        "pattern": r"bridge.*STP.*blocked|spanning.*tree.*blocked",
        "cause": "Spanning Tree Protocol blocked a port",
        "cause_detailed": "STP detected a potential network loop and blocked a bridge port to prevent broadcast storms. This is normal behavior but may indicate network topology issues.",
        "severity": "info",
        "solution": "Review network topology; this may be expected behavior",
        "solution_detailed": "1. Check bridge status: brctl show\n2. View STP state: brctl showstp <bridge>\n3. If unexpected, review network topology for loops\n4. Consider disabling STP if network is simple: brctl stp <bridge> off",
        "category": "network"
    },
    # ==================== SERVICES ====================
    {
        "pattern": r"pvedaemon.*failed|pveproxy.*failed|pvestatd.*failed",
        "cause": "Critical Proxmox service failed",
        "cause_detailed": "One of the core Proxmox daemons has crashed or failed to start. This may affect web GUI access or API functionality.",
        "severity": "critical",
        "solution": "Restart the failed service; check logs for cause",
        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -u <service> -n 50\n3. Restart: systemctl restart <service>\n4. If persistent, check: /var/log/pveproxy/access.log",
        "category": "pve_services"
    },
    {
        "pattern": r"failed to start.*service|service.*start.*failed|service.*activation.*failed",
        "cause": "System service failed to start",
        "cause_detailed": "A systemd service unit failed during startup. This could be due to configuration errors, missing dependencies, or resource issues.",
        "severity": "warning",
        "solution": "Check service logs with journalctl -u <service>",
        "solution_detailed": "1. Check status: systemctl status <service>\n2. View logs: journalctl -xeu <service>\n3. Check config: systemctl cat <service>\n4. Verify dependencies: systemctl list-dependencies <service>\n5. Try restart: systemctl restart <service>",
        "category": "services"
    },
    # ==================== BACKUP ====================
    {
        "pattern": r"backup.*failed|vzdump.*error|backup.*job.*failed",
        "cause": "Backup job failed",
        "cause_detailed": "A scheduled or manual backup operation failed. Common causes: storage full, VM locked, network issues for remote storage.",
        "severity": "warning",
        "solution": "Check backup storage space and VM status",
        "solution_detailed": "1. Check backup log in Datacenter > Backup\n2. Verify storage space: df -h\n3. Check if VM is locked: qm list or pct list\n4. Verify backup storage is accessible\n5. Try manual backup to identify specific error",
        "category": "backups"
    },
    # ==================== CERTIFICATES ====================
    {
        "pattern": r"certificate.*expired|SSL.*certificate.*expired|cert.*expir",
        "cause": "SSL/TLS certificate has expired",
        "cause_detailed": "An SSL certificate used for secure communication has passed its expiration date. This may cause connection failures or security warnings.",
        "severity": "warning",
        "solution": "Renew the certificate using pvenode cert set or Let's Encrypt",
        "solution_detailed": "1. Check certificate: pvenode cert info\n2. For self-signed renewal: pvecm updatecerts\n3. For Let's Encrypt: pvenode acme cert order\n4. Restart pveproxy after renewal: systemctl restart pveproxy",
        "url": "https://pve.proxmox.com/wiki/Certificate_Management",
        "category": "security"
    },
    # ==================== HARDWARE/TEMPERATURE ====================
    {
        "pattern": r"temperature.*critical|thermal.*critical|CPU.*overheating|temp.*above.*threshold",
        "cause": "Component temperature critical",
        "cause_detailed": "A hardware component (CPU, disk, etc.) has reached a dangerous temperature. Sustained high temperatures can cause hardware damage or system shutdowns.",
        "severity": "critical",
        "solution": "Check cooling system immediately; clean dust, verify fans",
        "solution_detailed": "1. Check current temps: sensors\n2. Verify all fans are running\n3. Clean dust from heatsinks and filters\n4. Ensure adequate airflow\n5. Consider reapplying thermal paste if CPU\n6. Check ambient room temperature",
        "category": "temperature"
    },
    # ==================== AUTHENTICATION ====================
    {
        "pattern": r"authentication.*failed|login.*failed|invalid.*credentials|access.*denied",
        "cause": "Authentication failure",
        "cause_detailed": "A login attempt failed due to invalid credentials or permissions. Multiple failures may indicate a brute-force attack.",
        "severity": "info",
        "solution": "Verify credentials; check for unauthorized access attempts",
        "solution_detailed": "1. Review auth logs: journalctl -u pvedaemon | grep auth\n2. Check for multiple failures from same IP\n3. Verify user exists: pveum user list\n4. If attack suspected, consider fail2ban\n5. Reset password if needed: pveum passwd <user>",
        "category": "security"
    },
]
def find_matching_error(text: str, category: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Return the first known-error entry whose pattern occurs in ``text``.

    Entries are tried in the order they appear in PROXMOX_KNOWN_ERRORS, so
    the earliest matching entry wins.

    Args:
        text: Error message or log content to search.
        category: When given (and non-empty), only entries whose
            "category" equals this value are considered.

    Returns:
        The matching entry dict, or None when ``text`` is empty or no
        pattern matches.
    """
    if not text:
        return None

    haystack = text.lower()

    # Narrow the candidates to the requested category up front.
    candidates = (
        entry
        for entry in PROXMOX_KNOWN_ERRORS
        if not category or entry.get("category") == category
    )

    for entry in candidates:
        try:
            found = re.search(entry["pattern"], haystack, re.IGNORECASE)
        except re.error:
            # An invalid pattern is ignored so it cannot break the lookup.
            continue
        if found:
            return entry

    return None
def get_error_context(text: str, category: Optional[str] = None, detail_level: str = "standard") -> Optional[str]:
    """Build a text summary of the known error matching ``text``.

    Args:
        text: Error message to look up.
        category: Optional category filter passed on to the lookup.
        detail_level: "minimal" yields a one-line cause, "standard" a short
            cause/severity/solution summary; any other value (e.g.
            "detailed") yields the summary built from the detailed fields,
            falling back to the short ones where they are absent.

    Returns:
        The formatted context string, or None when no known error matches.
    """
    match = find_matching_error(text, category)
    if not match:
        return None

    if detail_level == "minimal":
        return f"Known issue: {match['cause']}"

    # "standard" and the detailed fallback share one layout; they differ
    # only in which fields are read and in the label of the URL line.
    brief = detail_level == "standard"
    cause = match['cause'] if brief else match.get('cause_detailed', match['cause'])
    severity = match['severity'].upper()
    solution = match['solution'] if brief else match.get('solution_detailed', match['solution'])
    url_label = "Docs" if brief else "Documentation"

    report = [
        "KNOWN PROXMOX ERROR DETECTED:",
        f" Cause: {cause}",
        f" Severity: {severity}",
        f" Solution: {solution}",
    ]

    doc_url = match.get("url")
    if doc_url:
        report.append(f" {url_label}: {doc_url}")

    return "\n".join(report)
def get_all_patterns() -> List[str]:
    """Return the regex pattern of every known-error entry, in table order."""
    patterns: List[str] = []
    for entry in PROXMOX_KNOWN_ERRORS:
        patterns.append(entry["pattern"])
    return patterns