More Detailed SMART Monitoring
@@ -295,18 +295,34 @@ class SystemHealthMonitor:
     def _check_smart_health(self, device: str) -> Dict[str, Any]:
         """
-        Check comprehensive SMART health metrics for a drive.
+        Enhanced SMART health check with detailed failure thresholds.
         """
         smart_health = {
             'status': 'HEALTHY',
+            'severity': 'NORMAL',
             'issues': [],
             'temp': None,
+            'attributes': {}
         }

+        # Define critical SMART attributes and their thresholds
+        SMART_THRESHOLDS = {
+            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
+            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
+            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
+            'Reported_Uncorrect': {'warning': 1, 'critical': 2},
+            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
+            'Command_Timeout': {'warning': 5, 'critical': 10},
+            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
+            'Power_On_Hours': {'warning': 35040, 'critical': 43800},  # ~4-5 years
+            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},  # Percentage remaining
+            'Temperature_Celsius': {'warning': 65, 'critical': 75}
+        }
+
         try:
             # Get detailed SMART data
             result = subprocess.run(
-                ['smartctl', '-A', '-H', device],
+                ['smartctl', '-A', '-H', '-l', 'error', device],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True
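The attribute parsing in the second hunk depends on the column layout of smartctl's ATA attribute table, where RAW_VALUE is the tenth whitespace-separated field, hence parts[9]. A minimal sketch of that assumption, with an illustrative output line (real values vary by drive, and some drives report composite raw values such as "36 (Min/Max 20/52)", for which split() still yields the leading number at index 9):

    # Illustrative line from `smartctl -A`; the parser relies on this column order:
    # ID# ATTRIBUTE_NAME  FLAG  VALUE WORST THRESH TYPE  UPDATED  WHEN_FAILED RAW_VALUE
    sample = "  5 Reallocated_Sector_Ct   0x0033   100   100   010    Pre-fail  Always       -       0"
    parts = sample.split()
    assert parts[1] == 'Reallocated_Sector_Ct'  # the code matches the name as a substring of the line
    assert int(parts[9]) == 0                   # RAW_VALUE, the figure compared against SMART_THRESHOLDS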
@@ -314,46 +330,60 @@ class SystemHealthMonitor:

             output = result.stdout

-            # Check overall SMART status first
+            # Check overall health status
             if 'FAILED' in output and 'PASSED' not in output:
                 smart_health['status'] = 'UNHEALTHY'
+                smart_health['severity'] = 'CRITICAL'
                 smart_health['issues'].append("SMART overall health check failed")

-            # Parse SMART attributes
+            # Parse SMART attributes with thresholds
             for line in output.split('\n'):
                 if 'ATTRIBUTE_NAME' in line:
                     continue

                 # Check for current failures only
                 if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
                     smart_health['status'] = 'UNHEALTHY'
                     smart_health['issues'].append(f"Current failure detected: {line}")

-                # Monitor critical attributes
-                for attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector',
-                             'Offline_Uncorrectable', 'Reported_Uncorrect']:
+                for attr, thresholds in SMART_THRESHOLDS.items():
                     if attr in line:
                         parts = line.split()
                         if len(parts) >= 10:
                             raw_value = int(parts[9])
-                            if raw_value > 0:
-                                smart_health['status'] = 'UNHEALTHY'
-                                smart_health['issues'].append(f"{attr} has value {raw_value}")
+                            smart_health['attributes'][attr] = raw_value

-                # Check temperature
-                if 'Temperature_Celsius' in line or 'Airflow_Temperature_Cel' in line:
-                    parts = line.split()
-                    if len(parts) >= 10:
-                        temp = int(parts[9])
-                        smart_health['temp'] = temp
-                        if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
-                            smart_health['issues'].append(f"High drive temperature: {temp}°C")
+                            if attr == 'Temperature_Celsius':
+                                smart_health['temp'] = raw_value
+                                if raw_value >= thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
+                                elif raw_value >= thresholds['warning']:
+                                    smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"High temperature: {raw_value}°C")
+                            else:
+                                if raw_value >= thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                elif raw_value >= thresholds['warning']:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")

+            # Check for recent SMART errors
+            error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
+            error_matches = re.finditer(error_log_pattern, output)
+            recent_errors = []
+
+            for match in error_matches:
+                error_hour = int(match.group(1))
+                current_hours = smart_health['attributes'].get('Power_On_Hours', 0)
+                if current_hours - error_hour < 168:  # Errors within last week
+                    recent_errors.append(match.group(0))
+
+            if recent_errors:
+                smart_health['severity'] = 'WARNING'
+                smart_health['issues'].extend(recent_errors)

         except Exception as e:
             smart_health['status'] = 'ERROR'
+            smart_health['severity'] = 'UNKNOWN'
             smart_health['issues'].append(f"Error checking SMART: {str(e)}")

         return smart_health

     def _check_drives_health(self) -> Dict[str, Any]:
         """
         Check overall health of physical SATA and NVMe drives including disk usage and SMART status.
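The recent-error scan matches the header line of each ATA error log entry that smartctl -l error prints, and counts an error as recent when it falls within 168 power-on hours (one week) of the drive's current Power_On_Hours. Note the dependence on Power_On_Hours having been parsed: the .get(..., 0) default otherwise makes the difference negative, so no error is flagged. A self-contained sketch of what the regex captures, using an illustrative log line:

    import re

    # Illustrative header line from `smartctl -l error` output (values are drive-specific).
    sample = "Error 4 occurred at disk power-on lifetime: 21053 hours (877 days + 5 hours)"
    pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"

    match = re.search(pattern, sample)
    if match:
        error_hour = int(match.group(1))  # drive age, in power-on hours, when the error was logged
        # With Power_On_Hours currently at 21100, this error would be 47 hours old: recent.
        print(f"Error at {error_hour} power-on hours")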
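A hypothetical call site, assuming an already-constructed SystemHealthMonitor (its initialization is outside this diff):

    # Hypothetical usage; SystemHealthMonitor's constructor is not part of this commit.
    monitor = SystemHealthMonitor()
    health = monitor._check_smart_health('/dev/sda')

    print(health['status'], health['severity'])  # e.g. "HEALTHY NORMAL"
    for issue in health['issues']:
        print(' -', issue)

Note that after this change, 'status' still reflects only the overall PASSED/FAILED verdict and current WHEN_FAILED hits; threshold breaches now surface through 'severity' and 'issues' rather than flipping 'status' to UNHEALTHY as the old attribute loop did.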