IO test and more SMART parameters
This commit is contained in:
109
hwmonDaemon.py
109
hwmonDaemon.py
@ -61,6 +61,21 @@ class SystemHealthMonitor:
|
|||||||
'DEFAULT_CATEGORY': 'Hardware',
|
'DEFAULT_CATEGORY': 'Hardware',
|
||||||
'DEFAULT_ISSUE_TYPE': 'Problem'
|
'DEFAULT_ISSUE_TYPE': 'Problem'
|
||||||
}
|
}
|
||||||
|
PROBLEMATIC_FIRMWARE = {
|
||||||
|
'Samsung': {
|
||||||
|
'EVO860': ['RVT01B6Q', 'RVT02B6Q'], # Known issues with sudden performance drops
|
||||||
|
'EVO870': ['SVT01B6Q'],
|
||||||
|
'PM883': ['HXT7404Q'] # Known issues with TRIM
|
||||||
|
},
|
||||||
|
'Seagate': {
|
||||||
|
'ST8000NM': ['CC64'], # Known issues with NCQ
|
||||||
|
'ST12000NM': ['SN02']
|
||||||
|
},
|
||||||
|
'WDC': {
|
||||||
|
'WD121KRYZ': ['01.01A01'], # RAID rebuild issues
|
||||||
|
'WD141KRYZ': ['02.01A02']
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
||||||
@ -293,6 +308,56 @@ class SystemHealthMonitor:
|
|||||||
return False
|
return False
|
||||||
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)$', base_device))
|
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)$', base_device))
|
||||||
|
|
||||||
|
def _check_disk_firmware(self, device: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Check disk firmware version against known problematic versions.
|
||||||
|
"""
|
||||||
|
firmware_info = {
|
||||||
|
'version': None,
|
||||||
|
'model': None,
|
||||||
|
'manufacturer': None,
|
||||||
|
'is_problematic': False,
|
||||||
|
'known_issues': []
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
['smartctl', '-i', device],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
for line in result.stdout.split('\n'):
|
||||||
|
if 'Firmware Version:' in line:
|
||||||
|
firmware_info['version'] = line.split(':')[1].strip()
|
||||||
|
elif 'Model Family:' in line:
|
||||||
|
firmware_info['model'] = line.split(':')[1].strip()
|
||||||
|
elif 'Device Model:' in line:
|
||||||
|
if not firmware_info['model']:
|
||||||
|
firmware_info['model'] = line.split(':')[1].strip()
|
||||||
|
|
||||||
|
# Determine manufacturer
|
||||||
|
for manufacturer in self.PROBLEMATIC_FIRMWARE.keys():
|
||||||
|
if manufacturer in firmware_info['model']:
|
||||||
|
firmware_info['manufacturer'] = manufacturer
|
||||||
|
break
|
||||||
|
|
||||||
|
# Check against known problematic versions
|
||||||
|
if firmware_info['manufacturer'] and firmware_info['model']:
|
||||||
|
for model, versions in self.PROBLEMATIC_FIRMWARE[firmware_info['manufacturer']].items():
|
||||||
|
if model in firmware_info['model'] and firmware_info['version'] in versions:
|
||||||
|
firmware_info['is_problematic'] = True
|
||||||
|
firmware_info['known_issues'].append(
|
||||||
|
f"Known problematic firmware version {firmware_info['version']} "
|
||||||
|
f"for {firmware_info['model']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
firmware_info['known_issues'].append(f"Error checking firmware: {str(e)}")
|
||||||
|
|
||||||
|
return firmware_info
|
||||||
|
|
||||||
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Enhanced SMART health check with detailed failure thresholds.
|
Enhanced SMART health check with detailed failure thresholds.
|
||||||
@ -316,13 +381,28 @@ class SystemHealthMonitor:
|
|||||||
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
|
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
|
||||||
'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
|
'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
|
||||||
'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
|
'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
|
||||||
'Temperature_Celsius': {'warning': 65, 'critical': 75}
|
'Temperature_Celsius': {'warning': 65, 'critical': 75},
|
||||||
|
'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
|
||||||
|
'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
|
||||||
|
'Available_Spare': {'warning': 30, 'critical': 10},
|
||||||
|
'Program_Fail_Count': {'warning': 10, 'critical': 20},
|
||||||
|
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
|
||||||
|
'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
|
||||||
|
'Seek_Error_Rate': {'warning': 50, 'critical': 100},
|
||||||
|
'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
|
||||||
|
'SSD_Life_Left': {'warning': 30, 'critical': 10}
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get detailed SMART data
|
# Get firmware information
|
||||||
|
firmware_info = self._check_disk_firmware(device)
|
||||||
|
if firmware_info['is_problematic']:
|
||||||
|
smart_health['severity'] = 'WARNING'
|
||||||
|
smart_health['issues'].extend(firmware_info['known_issues'])
|
||||||
|
|
||||||
|
# Get detailed SMART data including performance metrics
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['smartctl', '-A', '-H', '-l', 'error', device],
|
['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device],
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
text=True
|
text=True
|
||||||
@ -377,6 +457,29 @@ class SystemHealthMonitor:
|
|||||||
smart_health['severity'] = 'WARNING'
|
smart_health['severity'] = 'WARNING'
|
||||||
smart_health['issues'].extend(recent_errors)
|
smart_health['issues'].extend(recent_errors)
|
||||||
|
|
||||||
|
smart_health['performance_metrics'] = {
|
||||||
|
'read_speed': None,
|
||||||
|
'write_speed': None,
|
||||||
|
'access_time': None
|
||||||
|
}
|
||||||
|
|
||||||
|
# Quick performance test
|
||||||
|
try:
|
||||||
|
perf_result = subprocess.run(
|
||||||
|
['hdparm', '-Tt', device],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
for line in perf_result.stdout.split('\n'):
|
||||||
|
if 'buffered disk reads' in line:
|
||||||
|
smart_health['performance_metrics']['read_speed'] = float(line.split()[0])
|
||||||
|
elif 'cached reads' in line:
|
||||||
|
smart_health['performance_metrics']['cached_speed'] = float(line.split()[0])
|
||||||
|
except:
|
||||||
|
pass # Skip performance metrics if hdparm fails
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
smart_health['status'] = 'ERROR'
|
smart_health['status'] = 'ERROR'
|
||||||
smart_health['severity'] = 'UNKNOWN'
|
smart_health['severity'] = 'UNKNOWN'
|
||||||
|
|||||||
Reference in New Issue
Block a user