- Add SQLAlchemy database setup with session management - Add Job model with status, priority and progress tracking - Add QueueManager with priority queue and deduplication - Add SystemSettings model for database-backed configuration - Add SettingsService with caching and defaults - Add SystemMonitor for CPU/RAM/GPU resource monitoring - Add LanguageCode utilities (moved from root)
295 lines
9.3 KiB
Python
295 lines
9.3 KiB
Python
"""System resource monitoring service."""
|
|
import logging
|
|
import platform
|
|
from typing import Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Try to import psutil (CPU/RAM monitoring)
|
|
try:
|
|
import psutil
|
|
PSUTIL_AVAILABLE = True
|
|
except ImportError:
|
|
PSUTIL_AVAILABLE = False
|
|
logger.warning("psutil not installed. CPU/RAM monitoring will be unavailable.")
|
|
|
|
# Try to import pynvml (NVIDIA GPU monitoring)
|
|
try:
|
|
import pynvml
|
|
pynvml.nvmlInit()
|
|
NVML_AVAILABLE = True
|
|
except Exception as e:
|
|
NVML_AVAILABLE = False
|
|
logger.debug(f"pynvml not available: {e}. GPU monitoring will be unavailable.")
|
|
|
|
|
|
class SystemMonitor:
|
|
"""Monitor system resources: CPU, RAM, GPU, VRAM."""
|
|
|
|
def __init__(self):
|
|
"""Initialize system monitor."""
|
|
self.gpu_count = 0
|
|
|
|
if NVML_AVAILABLE:
|
|
try:
|
|
self.gpu_count = pynvml.nvmlDeviceGetCount()
|
|
logger.info(f"Detected {self.gpu_count} NVIDIA GPU(s)")
|
|
except Exception as e:
|
|
logger.warning(f"Could not get GPU count: {e}")
|
|
self.gpu_count = 0
|
|
|
|
def get_cpu_info(self) -> Dict[str, any]:
|
|
"""
|
|
Get CPU usage information.
|
|
|
|
Returns:
|
|
Dictionary with CPU stats
|
|
"""
|
|
if not PSUTIL_AVAILABLE:
|
|
return {
|
|
"available": False,
|
|
"error": "psutil not installed"
|
|
}
|
|
|
|
try:
|
|
cpu_percent = psutil.cpu_percent(interval=0.1, percpu=False)
|
|
cpu_count = psutil.cpu_count(logical=True)
|
|
cpu_count_physical = psutil.cpu_count(logical=False)
|
|
|
|
# Get per-core usage
|
|
cpu_percent_per_core = psutil.cpu_percent(interval=0.1, percpu=True)
|
|
|
|
# Get CPU frequency
|
|
cpu_freq = psutil.cpu_freq()
|
|
freq_current = cpu_freq.current if cpu_freq else None
|
|
freq_max = cpu_freq.max if cpu_freq else None
|
|
|
|
return {
|
|
"available": True,
|
|
"usage_percent": round(cpu_percent, 1),
|
|
"count_logical": cpu_count,
|
|
"count_physical": cpu_count_physical,
|
|
"per_core_usage": [round(p, 1) for p in cpu_percent_per_core],
|
|
"frequency_mhz": round(freq_current, 0) if freq_current else None,
|
|
"frequency_max_mhz": round(freq_max, 0) if freq_max else None,
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting CPU info: {e}")
|
|
return {
|
|
"available": False,
|
|
"error": str(e)
|
|
}
|
|
|
|
def get_memory_info(self) -> Dict[str, any]:
|
|
"""
|
|
Get RAM usage information.
|
|
|
|
Returns:
|
|
Dictionary with memory stats
|
|
"""
|
|
if not PSUTIL_AVAILABLE:
|
|
return {
|
|
"available": False,
|
|
"error": "psutil not installed"
|
|
}
|
|
|
|
try:
|
|
mem = psutil.virtual_memory()
|
|
|
|
return {
|
|
"available": True,
|
|
"total_gb": round(mem.total / (1024**3), 2),
|
|
"used_gb": round(mem.used / (1024**3), 2),
|
|
"free_gb": round(mem.available / (1024**3), 2),
|
|
"usage_percent": round(mem.percent, 1),
|
|
"total_bytes": mem.total,
|
|
"used_bytes": mem.used,
|
|
"available_bytes": mem.available,
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting memory info: {e}")
|
|
return {
|
|
"available": False,
|
|
"error": str(e)
|
|
}
|
|
|
|
def get_swap_info(self) -> Dict[str, any]:
|
|
"""
|
|
Get swap memory information.
|
|
|
|
Returns:
|
|
Dictionary with swap stats
|
|
"""
|
|
if not PSUTIL_AVAILABLE:
|
|
return {
|
|
"available": False,
|
|
"error": "psutil not installed"
|
|
}
|
|
|
|
try:
|
|
swap = psutil.swap_memory()
|
|
|
|
return {
|
|
"available": True,
|
|
"total_gb": round(swap.total / (1024**3), 2),
|
|
"used_gb": round(swap.used / (1024**3), 2),
|
|
"free_gb": round(swap.free / (1024**3), 2),
|
|
"usage_percent": round(swap.percent, 1),
|
|
"total_bytes": swap.total,
|
|
"used_bytes": swap.used,
|
|
"free_bytes": swap.free,
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting swap info: {e}")
|
|
return {
|
|
"available": False,
|
|
"error": str(e)
|
|
}
|
|
|
|
def get_gpu_info(self, device_id: int = 0) -> Dict[str, any]:
|
|
"""
|
|
Get GPU information for a specific device.
|
|
|
|
Args:
|
|
device_id: GPU device ID (default: 0)
|
|
|
|
Returns:
|
|
Dictionary with GPU stats
|
|
"""
|
|
if not NVML_AVAILABLE:
|
|
return {
|
|
"available": False,
|
|
"device_id": device_id,
|
|
"error": "pynvml not available or no NVIDIA GPUs detected"
|
|
}
|
|
|
|
if device_id >= self.gpu_count:
|
|
return {
|
|
"available": False,
|
|
"device_id": device_id,
|
|
"error": f"GPU device {device_id} not found. Only {self.gpu_count} GPU(s) available."
|
|
}
|
|
|
|
try:
|
|
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
|
|
|
|
# Get GPU name
|
|
name = pynvml.nvmlDeviceGetName(handle)
|
|
if isinstance(name, bytes):
|
|
name = name.decode('utf-8')
|
|
|
|
# Get memory info
|
|
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
|
|
|
# Get utilization
|
|
util = pynvml.nvmlDeviceGetUtilizationRates(handle)
|
|
|
|
# Get temperature
|
|
try:
|
|
temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
|
|
except Exception:
|
|
temp = None
|
|
|
|
# Get power usage
|
|
try:
|
|
power_usage = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 # Convert mW to W
|
|
power_limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
|
|
except Exception:
|
|
power_usage = None
|
|
power_limit = None
|
|
|
|
# Get fan speed
|
|
try:
|
|
fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
|
|
except Exception:
|
|
fan_speed = None
|
|
|
|
return {
|
|
"available": True,
|
|
"device_id": device_id,
|
|
"name": name,
|
|
"memory": {
|
|
"total_gb": round(mem_info.total / (1024**3), 2),
|
|
"used_gb": round(mem_info.used / (1024**3), 2),
|
|
"free_gb": round(mem_info.free / (1024**3), 2),
|
|
"usage_percent": round((mem_info.used / mem_info.total) * 100, 1),
|
|
"total_bytes": mem_info.total,
|
|
"used_bytes": mem_info.used,
|
|
"free_bytes": mem_info.free,
|
|
},
|
|
"utilization": {
|
|
"gpu_percent": util.gpu,
|
|
"memory_percent": util.memory,
|
|
},
|
|
"temperature_c": temp,
|
|
"power": {
|
|
"usage_watts": round(power_usage, 1) if power_usage else None,
|
|
"limit_watts": round(power_limit, 1) if power_limit else None,
|
|
"usage_percent": round((power_usage / power_limit) * 100, 1) if (power_usage and power_limit) else None,
|
|
},
|
|
"fan_speed_percent": fan_speed,
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting GPU {device_id} info: {e}")
|
|
return {
|
|
"available": False,
|
|
"device_id": device_id,
|
|
"error": str(e)
|
|
}
|
|
|
|
def get_all_gpus_info(self) -> List[Dict[str, any]]:
|
|
"""
|
|
Get information for all available GPUs.
|
|
|
|
Returns:
|
|
List of GPU info dictionaries
|
|
"""
|
|
if not NVML_AVAILABLE or self.gpu_count == 0:
|
|
return []
|
|
|
|
return [self.get_gpu_info(i) for i in range(self.gpu_count)]
|
|
|
|
def get_system_info(self) -> Dict[str, any]:
|
|
"""
|
|
Get general system information.
|
|
|
|
Returns:
|
|
Dictionary with system info
|
|
"""
|
|
return {
|
|
"platform": platform.system(),
|
|
"platform_release": platform.release(),
|
|
"platform_version": platform.version(),
|
|
"architecture": platform.machine(),
|
|
"processor": platform.processor(),
|
|
"python_version": platform.python_version(),
|
|
}
|
|
|
|
def get_all_resources(self) -> Dict[str, any]:
|
|
"""
|
|
Get all system resources in a single call.
|
|
|
|
Returns:
|
|
Comprehensive system resource dictionary
|
|
"""
|
|
return {
|
|
"system": self.get_system_info(),
|
|
"cpu": self.get_cpu_info(),
|
|
"memory": self.get_memory_info(),
|
|
"swap": self.get_swap_info(),
|
|
"gpus": self.get_all_gpus_info(),
|
|
"gpu_count": self.gpu_count,
|
|
}
|
|
|
|
def __del__(self):
|
|
"""Cleanup NVML on destruction."""
|
|
if NVML_AVAILABLE:
|
|
try:
|
|
pynvml.nvmlShutdown()
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
# Global system monitor instance
|
|
system_monitor = SystemMonitor()
|