IO test and more SMART parameters

This commit is contained in:
2025-03-03 16:38:02 -05:00
parent 6a498ed33a
commit 8660b3b6e4

View File

@ -61,6 +61,21 @@ class SystemHealthMonitor:
'DEFAULT_CATEGORY': 'Hardware',
'DEFAULT_ISSUE_TYPE': 'Problem'
}
# Firmware revisions flagged as problematic, keyed first by manufacturer and
# then by model identifier; each value is the list of affected firmware
# version strings.
# _check_disk_firmware looks up the manufacturer and model keys as substrings
# of the model text parsed from `smartctl -i`, and tests the reported firmware
# version for membership in the list.
# NOTE(review): because the keys are matched as substrings, they must appear
# verbatim in the drive's reported model text -- confirm entries such as
# 'EVO860' against real smartctl output. TODO verify
PROBLEMATIC_FIRMWARE = {
'Samsung': {
'EVO860': ['RVT01B6Q', 'RVT02B6Q'], # Known issues with sudden performance drops
'EVO870': ['SVT01B6Q'],
'PM883': ['HXT7404Q'] # Known issues with TRIM
},
'Seagate': {
'ST8000NM': ['CC64'], # Known issues with NCQ
'ST12000NM': ['SN02']
},
'WDC': {
'WD121KRYZ': ['01.01A01'], # RAID rebuild issues
'WD141KRYZ': ['02.01A02']
}
}
def __init__(self,
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
@ -293,6 +308,56 @@ class SystemHealthMonitor:
return False
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)$', base_device))
def _check_disk_firmware(self, device: str) -> Dict[str, Any]:
"""
Check disk firmware version against known problematic versions.
"""
firmware_info = {
'version': None,
'model': None,
'manufacturer': None,
'is_problematic': False,
'known_issues': []
}
try:
result = subprocess.run(
['smartctl', '-i', device],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
for line in result.stdout.split('\n'):
if 'Firmware Version:' in line:
firmware_info['version'] = line.split(':')[1].strip()
elif 'Model Family:' in line:
firmware_info['model'] = line.split(':')[1].strip()
elif 'Device Model:' in line:
if not firmware_info['model']:
firmware_info['model'] = line.split(':')[1].strip()
# Determine manufacturer
for manufacturer in self.PROBLEMATIC_FIRMWARE.keys():
if manufacturer in firmware_info['model']:
firmware_info['manufacturer'] = manufacturer
break
# Check against known problematic versions
if firmware_info['manufacturer'] and firmware_info['model']:
for model, versions in self.PROBLEMATIC_FIRMWARE[firmware_info['manufacturer']].items():
if model in firmware_info['model'] and firmware_info['version'] in versions:
firmware_info['is_problematic'] = True
firmware_info['known_issues'].append(
f"Known problematic firmware version {firmware_info['version']} "
f"for {firmware_info['model']}"
)
except Exception as e:
firmware_info['known_issues'].append(f"Error checking firmware: {str(e)}")
return firmware_info
def _check_smart_health(self, device: str) -> Dict[str, Any]:
"""
Enhanced SMART health check with detailed failure thresholds.
@ -316,13 +381,28 @@ class SystemHealthMonitor:
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
'Temperature_Celsius': {'warning': 65, 'critical': 75}
'Temperature_Celsius': {'warning': 65, 'critical': 75},
'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
'Available_Spare': {'warning': 30, 'critical': 10},
'Program_Fail_Count': {'warning': 10, 'critical': 20},
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
'Seek_Error_Rate': {'warning': 50, 'critical': 100},
'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
'SSD_Life_Left': {'warning': 30, 'critical': 10}
}
try:
# Get detailed SMART data
# Get firmware information
firmware_info = self._check_disk_firmware(device)
if firmware_info['is_problematic']:
smart_health['severity'] = 'WARNING'
smart_health['issues'].extend(firmware_info['known_issues'])
# Get detailed SMART data including performance metrics
result = subprocess.run(
['smartctl', '-A', '-H', '-l', 'error', device],
['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
@ -377,6 +457,29 @@ class SystemHealthMonitor:
smart_health['severity'] = 'WARNING'
smart_health['issues'].extend(recent_errors)
smart_health['performance_metrics'] = {
'read_speed': None,
'write_speed': None,
'access_time': None
}
# Quick performance test
try:
perf_result = subprocess.run(
['hdparm', '-Tt', device],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
for line in perf_result.stdout.split('\n'):
if 'buffered disk reads' in line:
smart_health['performance_metrics']['read_speed'] = float(line.split()[0])
elif 'cached reads' in line:
smart_health['performance_metrics']['cached_speed'] = float(line.split()[0])
except:
pass # Skip performance metrics if hdparm fails
except Exception as e:
smart_health['status'] = 'ERROR'
smart_health['severity'] = 'UNKNOWN'