IO test and more SMART parameters
This commit is contained in:
109
hwmonDaemon.py
109
hwmonDaemon.py
@ -61,6 +61,21 @@ class SystemHealthMonitor:
|
||||
'DEFAULT_CATEGORY': 'Hardware',
|
||||
'DEFAULT_ISSUE_TYPE': 'Problem'
|
||||
}
|
||||
# Firmware revisions flagged as problematic, organised as
#   manufacturer -> model identifier -> list of firmware version strings.
# _check_disk_firmware() consumes this table: the manufacturer and model
# identifier keys are matched as substrings of the model text reported by
# `smartctl -i`, and the reported firmware version must appear in the list.
PROBLEMATIC_FIRMWARE = {
|
||||
'Samsung': {
|
||||
'EVO860': ['RVT01B6Q', 'RVT02B6Q'], # Known issues with sudden performance drops
|
||||
'EVO870': ['SVT01B6Q'],
|
||||
'PM883': ['HXT7404Q'] # Known issues with TRIM
|
||||
},
|
||||
'Seagate': {
|
||||
'ST8000NM': ['CC64'], # Known issues with NCQ
|
||||
'ST12000NM': ['SN02']
|
||||
},
|
||||
'WDC': {
|
||||
'WD121KRYZ': ['01.01A01'], # RAID rebuild issues
|
||||
'WD141KRYZ': ['02.01A02']
|
||||
}
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
||||
@ -293,6 +308,56 @@ class SystemHealthMonitor:
|
||||
return False
|
||||
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)$', base_device))
|
||||
|
||||
def _check_disk_firmware(self, device: str) -> Dict[str, Any]:
    """
    Check disk firmware version against known problematic versions.

    Runs ``smartctl -i`` on *device*, extracts the firmware version and the
    model identification lines, and looks the drive up in
    ``self.PROBLEMATIC_FIRMWARE`` (manufacturer -> model key -> versions).

    Args:
        device: Path of the block device to query (passed to smartctl).

    Returns:
        A dict with the keys:
            'version'        firmware version string, or None if not reported
            'model'          Model Family if reported, otherwise the device
                             model / model number, otherwise None
            'manufacturer'   matching PROBLEMATIC_FIRMWARE key, or None
            'is_problematic' True when the firmware version is listed for a
                             matching model
            'known_issues'   list of human-readable issue strings

    This method does not raise: any failure (for example smartctl missing)
    is recorded as an entry in 'known_issues' instead.
    """
    firmware_info = {
        'version': None,
        'model': None,
        'manufacturer': None,
        'is_problematic': False,
        'known_issues': []
    }

    try:
        result = subprocess.run(
            ['smartctl', '-i', device],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )

        model_family = None
        device_model = None
        for line in result.stdout.splitlines():
            # Split on the first colon only so values that themselves
            # contain ':' are not truncated.
            value = line.partition(':')[2].strip()
            if 'Firmware Version:' in line:
                firmware_info['version'] = value
            elif 'Model Family:' in line:
                model_family = value
            elif 'Device Model:' in line or 'Model Number:' in line:
                # ATA drives report "Device Model", NVMe drives "Model Number".
                device_model = value

        # Reported model keeps the historical precedence: family first.
        firmware_info['model'] = model_family or device_model

        # Match against every identification string we have. The vendor
        # token and the model number usually live in the device model line,
        # while the family line carries the marketing name.
        identity_strings = [s for s in (model_family, device_model) if s]

        # Determine manufacturer
        for manufacturer in self.PROBLEMATIC_FIRMWARE:
            if any(manufacturer in text for text in identity_strings):
                firmware_info['manufacturer'] = manufacturer
                break

        # Check against known problematic versions
        manufacturer = firmware_info['manufacturer']
        version = firmware_info['version']
        if manufacturer and version:
            for model, versions in self.PROBLEMATIC_FIRMWARE[manufacturer].items():
                if version in versions and any(model in text for text in identity_strings):
                    firmware_info['is_problematic'] = True
                    firmware_info['known_issues'].append(
                        f"Known problematic firmware version {firmware_info['version']} "
                        f"for {firmware_info['model']}"
                    )

    except Exception as e:
        # Deliberate best-effort boundary: callers expect a dict, never an
        # exception, so the failure is surfaced through 'known_issues'.
        firmware_info['known_issues'].append(f"Error checking firmware: {str(e)}")

    return firmware_info
|
||||
|
||||
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Enhanced SMART health check with detailed failure thresholds.
|
||||
@ -316,13 +381,28 @@ class SystemHealthMonitor:
|
||||
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
|
||||
'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
|
||||
'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
|
||||
'Temperature_Celsius': {'warning': 65, 'critical': 75}
|
||||
'Temperature_Celsius': {'warning': 65, 'critical': 75},
|
||||
'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
|
||||
'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
|
||||
'Available_Spare': {'warning': 30, 'critical': 10},
|
||||
'Program_Fail_Count': {'warning': 10, 'critical': 20},
|
||||
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
|
||||
'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
|
||||
'Seek_Error_Rate': {'warning': 50, 'critical': 100},
|
||||
'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
|
||||
'SSD_Life_Left': {'warning': 30, 'critical': 10}
|
||||
}
|
||||
|
||||
try:
|
||||
# Get detailed SMART data
|
||||
# Get firmware information
|
||||
firmware_info = self._check_disk_firmware(device)
|
||||
if firmware_info['is_problematic']:
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].extend(firmware_info['known_issues'])
|
||||
|
||||
# Get detailed SMART data including performance metrics
|
||||
result = subprocess.run(
|
||||
['smartctl', '-A', '-H', '-l', 'error', device],
|
||||
['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
@ -377,6 +457,29 @@ class SystemHealthMonitor:
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].extend(recent_errors)
|
||||
|
||||
smart_health['performance_metrics'] = {
|
||||
'read_speed': None,
|
||||
'write_speed': None,
|
||||
'access_time': None
|
||||
}
|
||||
|
||||
# Quick performance test
|
||||
try:
|
||||
perf_result = subprocess.run(
|
||||
['hdparm', '-Tt', device],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
for line in perf_result.stdout.split('\n'):
|
||||
if 'buffered disk reads' in line:
|
||||
smart_health['performance_metrics']['read_speed'] = float(line.split()[0])
|
||||
elif 'cached reads' in line:
|
||||
smart_health['performance_metrics']['cached_speed'] = float(line.split()[0])
|
||||
except:
|
||||
pass # Skip performance metrics if hdparm fails
|
||||
|
||||
except Exception as e:
|
||||
smart_health['status'] = 'ERROR'
|
||||
smart_health['severity'] = 'UNKNOWN'
|
||||
|
||||
Reference in New Issue
Block a user