Testing manufacturer-specific SMART tests
hwmonDaemon.py (237 changed lines)
@@ -107,6 +107,81 @@ class SystemHealthMonitor:
             'WD141KRYZ': ['02.01A02']
         }
     }
+    MANUFACTURER_SMART_PROFILES = {
+        'Ridata': {
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK'],
+            'wear_leveling_behavior': 'countup',  # Based on observed data, it counts up
+            'wear_leveling_baseline': 0,
+            'wear_leveling_thresholds': {
+                'warning': 500000,   # Much higher threshold for countup behavior
+                'critical': 1000000  # Very high threshold
+            },
+            'attributes': {
+                'Wear_Leveling_Count': {
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 500000,
+                    'critical_threshold': 1000000,
+                    'description': 'Total wear leveling operations performed (countup from 0)',
+                    'ignore_on_new_drive': True  # Don't alert on new drives
+                }
+            }
+        },
+        'Samsung': {
+            'aliases': ['Samsung', 'SAMSUNG'],
+            'wear_leveling_behavior': 'countup',
+            'wear_leveling_baseline': 0,
+            'wear_leveling_thresholds': {
+                'warning': 2000,
+                'critical': 3000
+            },
+            'attributes': {
+                'Wear_Leveling_Count': {
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 2000,
+                    'critical_threshold': 3000,
+                    'description': 'Total wear leveling operations performed'
+                }
+            }
+        },
+        'Intel': {
+            'aliases': ['Intel', 'INTEL'],
+            'wear_leveling_behavior': 'percentage',
+            'wear_leveling_baseline': 100,
+            'wear_leveling_thresholds': {
+                'warning': 30,
+                'critical': 10
+            },
+            'attributes': {
+                'Media_Wearout_Indicator': {
+                    'behavior': 'countdown',
+                    'baseline': 100,
+                    'warning_threshold': 30,
+                    'critical_threshold': 10,
+                    'description': 'Percentage of rated life remaining'
+                }
+            }
+        },
+        'Micron': {
+            'aliases': ['Micron', 'MICRON', 'Crucial', 'CRUCIAL'],
+            'wear_leveling_behavior': 'percentage',
+            'wear_leveling_baseline': 100,
+            'wear_leveling_thresholds': {
+                'warning': 30,
+                'critical': 10
+            }
+        },
+        'Generic': {  # Fallback for unknown manufacturers
+            'aliases': ['Unknown', 'Generic'],
+            'wear_leveling_behavior': 'unknown',
+            'wear_leveling_baseline': None,
+            'wear_leveling_thresholds': {
+                'warning': None,  # Don't trigger on unknown
+                'critical': None
+            }
+        }
+    }
     SEVERITY_INDICATORS = {
         'CRITICAL': '🔴',
         'WARNING': '🟡',
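As a usage illustration, the following is a minimal standalone sketch (hypothetical, not code from the commit) of how a reported model string can be matched against these profiles by alias, mirroring the _get_manufacturer_profile helper added later in this diff. The trimmed PROFILES table and the sample model strings are made-up stand-ins.

from typing import Any, Dict, Optional

# Hypothetical, trimmed copy of the profile table, for illustration only.
PROFILES: Dict[str, Dict[str, Any]] = {
    'Samsung': {'aliases': ['Samsung', 'SAMSUNG'], 'wear_leveling_behavior': 'countup'},
    'Intel': {'aliases': ['Intel', 'INTEL'], 'wear_leveling_behavior': 'percentage'},
    'Generic': {'aliases': ['Unknown', 'Generic'], 'wear_leveling_behavior': 'unknown'},
}

def match_profile(model: str, manufacturer: Optional[str] = None) -> Dict[str, Any]:
    """Return the first profile whose alias appears in the model or manufacturer string."""
    for name, profile in PROFILES.items():
        for alias in profile['aliases']:
            if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
                return profile
    return PROFILES['Generic']

print(match_profile('Samsung SSD 870 EVO 1TB')['wear_leveling_behavior'])  # countup
print(match_profile('ST4000DM004')['wear_leveling_behavior'])              # unknown (falls back to Generic)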
@@ -274,13 +349,18 @@ class SystemHealthMonitor:
         SSD block erase distribution metric.
         Impact:
         - Indicates wear pattern uniformity
-        - Higher values show more balanced wear
+        - Interpretation varies by manufacturer
         - Critical for SSD longevity

         Recommended Actions:
         1. Monitor trend over time
-        2. Compare with similar drives
+        2. Compare with manufacturer baseline
         3. Check workload distribution
+
+        Note: Different manufacturers use different counting methods:
+        - Some count up from 0 (Samsung, Ridata, etc.)
+        - Others count down from a baseline (e.g. Intel's Media_Wearout_Indicator)
+        - Always check manufacturer specifications
         """
     }
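To make the counting-method note concrete, here is a short hedged example: the same raw value means very different things depending on the declared behavior. The threshold numbers come from the profiles above; the classify helper itself is a hypothetical illustration, not code from this commit.

def classify(raw_value: int, behavior: str, warning: int, critical: int) -> str:
    """Classify a wear-leveling raw value under countup or countdown semantics."""
    if behavior == 'countup':       # larger value = more accumulated wear
        if raw_value >= critical:
            return 'CRITICAL'
        return 'WARNING' if raw_value >= warning else 'NORMAL'
    if behavior == 'countdown':     # smaller value = less life remaining
        if raw_value <= critical:
            return 'CRITICAL'
        return 'WARNING' if raw_value <= warning else 'NORMAL'
    return 'NORMAL'                 # unknown behavior: never alert

# Samsung-style countup thresholds (2000 / 3000) vs Intel-style countdown (30 / 10):
print(classify(2500, 'countup', 2000, 3000))  # WARNING
print(classify(25, 'countdown', 30, 10))      # WARNING
print(classify(25, 'countup', 2000, 3000))    # NORMAL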
@@ -983,49 +1063,56 @@ class SystemHealthMonitor:
             logger.debug(f"Could not parse SMART value: {raw_value}")
             return 0

+    def _get_manufacturer_profile(self, model: str, manufacturer: str = None) -> Dict[str, Any]:
+        """
+        Get manufacturer-specific SMART profile based on drive model/manufacturer.
+        """
+        # Check each manufacturer profile
+        for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items():
+            for alias in profile['aliases']:
+                if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
+                    logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}")
+                    return profile
+
+        # Return generic profile if no match
+        logger.debug(f"No specific profile found for {model}, using Generic profile")
+        return self.MANUFACTURER_SMART_PROFILES['Generic']
+
+    def _is_new_drive(self, power_on_hours: int) -> bool:
+        """
+        Determine if a drive is considered "new" based on power-on hours.
+        """
+        return power_on_hours < 168  # Less than 1 week of runtime
+
     def _check_smart_health(self, device: str) -> Dict[str, Any]:
         """
-        Enhanced SMART health check with detailed failure thresholds.
+        Enhanced SMART health check with manufacturer-specific thresholds.
         """
         smart_health = {
             'status': 'HEALTHY',
             'severity': 'NORMAL',
             'issues': [],
             'temp': None,
-            'attributes': {}
-        }
-
-        # Define critical SMART attributes and their thresholds
-        SMART_THRESHOLDS = {
-            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
-            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
-            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
-            'Reported_Uncorrect': {'warning': 1, 'critical': 10},
-            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
-            # 'Command_Timeout': {'warning': 5, 'critical': 10},  # Removed
-            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
-            'Power_On_Hours': {'warning': 61320, 'critical': 70080},  # ~7-8 years
-            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},
-            'Temperature_Celsius': {'warning': 65, 'critical': 75},
-            'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
-            'Wear_Leveling_Count': {'warning': 2000, 'critical': 3000},
-            'Available_Spare': {'warning': 30, 'critical': 10},
-            'Program_Fail_Count': {'warning': 10, 'critical': 20},
-            'Erase_Fail_Count': {'warning': 10, 'critical': 20},
-            # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},  # Removed
-            # 'Seek_Error_Rate': {'warning': 50, 'critical': 100},  # Removed
-            'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
-            'SSD_Life_Left': {'warning': 30, 'critical': 10}
+            'attributes': {},
+            'manufacturer_profile': None
         }

         try:
+            # Get drive details first to determine manufacturer
+            drive_details = self._get_drive_details(device)
+            manufacturer_profile = self._get_manufacturer_profile(
+                drive_details.get('model', ''),
+                drive_details.get('manufacturer', '')
+            )
+            smart_health['manufacturer_profile'] = manufacturer_profile
+
             # Get firmware information
             firmware_info = self._check_disk_firmware(device)
             if firmware_info['is_problematic']:
                 smart_health['severity'] = 'WARNING'
                 smart_health['issues'].extend(firmware_info['known_issues'])

-            # Get detailed SMART data including performance metrics
+            # Get detailed SMART data
             result = subprocess.run(
                 ['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device],
                 stdout=subprocess.PIPE,
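The exact return shape of _get_drive_details is not shown in this hunk; assuming it returns a dict with 'model' and 'manufacturer' keys (which the .get() calls imply), the new-drive gate composes with the profile lookup roughly like this hypothetical sketch.

def is_new_drive(power_on_hours: int) -> bool:
    # Same one-week cutoff the commit uses (168 hours).
    return power_on_hours < 168

# Hypothetical stand-in for what _get_drive_details(device) might return.
drive_details = {'model': 'RIDATA SSD 512GB', 'manufacturer': ''}

# With the Ridata profile's ignore_on_new_drive=True, a drive this young would
# have its Wear_Leveling_Count evaluation skipped entirely.
print(is_new_drive(72))    # True  -> skip wear-leveling alerts
print(is_new_drive(5000))  # False -> evaluate normally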
@@ -1041,25 +1128,81 @@ class SystemHealthMonitor:
                 smart_health['severity'] = 'CRITICAL'
                 smart_health['issues'].append("SMART overall health check failed")

-            # Parse SMART attributes with thresholds
-            for line in output.split('\n'):
-                if 'Reported_Uncorrect' in line:
-                    parts = line.split()
-                    raw_value = self._parse_smart_value(parts[9])
-                    logger.debug(f"Found Reported_Uncorrect value: {raw_value}")
-                    smart_health['attributes']['Reported_Uncorrect'] = raw_value
-
-                    if raw_value >= SMART_THRESHOLDS['Reported_Uncorrect']['critical']:
-                        smart_health['status'] = 'UNHEALTHY'
-                        smart_health['severity'] = 'CRITICAL'
-                        smart_health['issues'].append(f"Critical uncorrectable errors: {raw_value}")
-                    elif raw_value >= SMART_THRESHOLDS['Reported_Uncorrect']['warning']:
-                        if smart_health['severity'] != 'CRITICAL':
-                            smart_health['severity'] = 'WARNING'
-                        smart_health['issues'].append(f"Warning: uncorrectable errors detected: {raw_value}")
-
-                for attr, thresholds in SMART_THRESHOLDS.items():
-                    if attr in line:
+            # Parse SMART attributes with manufacturer-specific handling
+            power_on_hours = 0
+            for line in output.split('\n'):
+                # Extract Power_On_Hours first to determine if drive is new
+                if 'Power_On_Hours' in line:
+                    parts = line.split()
+                    if len(parts) >= 10:
+                        power_on_hours = self._parse_smart_value(parts[9])
+                        smart_health['attributes']['Power_On_Hours'] = power_on_hours
+
+            # Check if this is a new drive
+            is_new_drive = self._is_new_drive(power_on_hours)
+            logger.debug(f"Drive {device} power-on hours: {power_on_hours}, is_new_drive: {is_new_drive}")
+
+            # Define base SMART thresholds (for non-manufacturer specific attributes)
+            BASE_SMART_THRESHOLDS = {
+                'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
+                'Current_Pending_Sector': {'warning': 1, 'critical': 5},
+                'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
+                'Reported_Uncorrect': {'warning': 1, 'critical': 10},
+                'Spin_Retry_Count': {'warning': 1, 'critical': 5},
+                'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
+                'Power_On_Hours': {'warning': 61320, 'critical': 70080},  # ~7-8 years
+                'Temperature_Celsius': {'warning': 65, 'critical': 75},
+                'Available_Spare': {'warning': 30, 'critical': 10},
+                'Program_Fail_Count': {'warning': 10, 'critical': 20},
+                'Erase_Fail_Count': {'warning': 10, 'critical': 20},
+                'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
+                'SSD_Life_Left': {'warning': 30, 'critical': 10}
+            }
+
+            # Parse all SMART attributes
+            for line in output.split('\n'):
+                # Handle manufacturer-specific Wear_Leveling_Count
+                if 'Wear_Leveling_Count' in line:
+                    parts = line.split()
+                    if len(parts) >= 10:
+                        raw_value = self._parse_smart_value(parts[9])
+                        smart_health['attributes']['Wear_Leveling_Count'] = raw_value
+
+                        # Get manufacturer-specific thresholds
+                        wear_attr = manufacturer_profile.get('attributes', {}).get('Wear_Leveling_Count', {})
+
+                        # Skip evaluation if this is a new drive and manufacturer profile says to ignore
+                        if is_new_drive and wear_attr.get('ignore_on_new_drive', False):
+                            logger.debug(f"Skipping Wear_Leveling_Count evaluation for new drive: {raw_value}")
+                            continue
+
+                        warning_threshold = wear_attr.get('warning_threshold')
+                        critical_threshold = wear_attr.get('critical_threshold')
+
+                        if warning_threshold and critical_threshold:
+                            behavior = wear_attr.get('behavior', 'countup')
+
+                            if behavior == 'countup':
+                                if raw_value >= critical_threshold:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical wear leveling count: {raw_value}")
+                                elif raw_value >= warning_threshold:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"High wear leveling count: {raw_value}")
+                            elif behavior == 'countdown':
+                                if raw_value <= critical_threshold:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical wear leveling remaining: {raw_value}")
+                                elif raw_value <= warning_threshold:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
+
+                # Handle all other standard SMART attributes
+                for attr, thresholds in BASE_SMART_THRESHOLDS.items():
+                    if attr in line and attr != 'Wear_Leveling_Count':  # Skip wear leveling as it's handled above
                         parts = line.split()
                         if len(parts) >= 10:
                             raw_value = self._parse_smart_value(parts[9])
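The parts[9] indexing relies on the standard smartctl -A ATA attribute table, where RAW_VALUE is the tenth whitespace-separated column. A small sketch with a made-up sample row (the row text is illustrative, not captured output; the daemon's _parse_smart_value additionally copes with non-numeric raw strings):

# One attribute row in the usual `smartctl -A` layout:
# ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
sample_line = "177 Wear_Leveling_Count 0x0013 099 099 000 Pre-fail Always - 1350"

parts = sample_line.split()
if len(parts) >= 10:              # same guard the daemon uses
    raw_value = int(parts[9])     # RAW_VALUE column
    print(raw_value)              # 1350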
@@ -1071,6 +1214,7 @@ class SystemHealthMonitor:
                                 smart_health['severity'] = 'CRITICAL'
                                 smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
                             elif raw_value >= thresholds['warning']:
+                                if smart_health['severity'] != 'CRITICAL':
                                     smart_health['severity'] = 'WARNING'
                                 smart_health['issues'].append(f"High temperature: {raw_value}°C")
                             else:
@@ -1098,10 +1242,12 @@ class SystemHealthMonitor:
                 smart_health['issues'].extend(recent_errors)

             logger.debug(f"=== SMART Health Check for {device} ===")
+            logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0]}")
             logger.debug("Raw SMART attributes:")
             for attr, value in smart_health['attributes'].items():
                 logger.debug(f"{attr}: {value}")
             logger.debug(f"Temperature: {smart_health['temp']}°C")
+            logger.debug(f"Is new drive: {is_new_drive}")
             logger.debug(f"Detected Issues: {smart_health['issues']}")
             logger.debug("=== End SMART Check ===\n")

@@ -1408,7 +1554,6 @@ class SystemHealthMonitor:
         }

         try:
-            logger.debug("Executing 'pct list' command")
             result = subprocess.run(
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
@@ -1418,7 +1563,6 @@ class SystemHealthMonitor:
             logger.debug(f"pct list output:\n{result.stdout}")

             for line in result.stdout.split('\n')[1:]:
-                logger.debug(f"Raw LXC line: {line}")
                 if not line.strip():
                     continue

@@ -1428,7 +1572,6 @@ class SystemHealthMonitor:
                     continue

                 vmid, status = parts[0], parts[1]
-                logger.debug(f"Processing container VMID: {vmid}, Status: {status}")

                 if status.lower() == 'running':
                     logger.debug(f"Checking container {vmid} disk usage")
@@ -1438,7 +1581,6 @@ class SystemHealthMonitor:
                         stderr=subprocess.PIPE,
                         text=True
                     )
-                    logger.debug(f"Raw disk info output:\n{disk_info.stdout}")

                     container_info = {
                         'vmid': vmid,
@@ -1447,11 +1589,8 @@ class SystemHealthMonitor:

                     for fs_line in disk_info.stdout.split('\n')[1:]:
                         if not fs_line.strip() or 'MP' in fs_line:
-                            logger.debug(f"Skipping line: {fs_line}")
                             continue

-                        logger.debug(f"Processing filesystem line: {fs_line}")
-                        #parts = fs_line.split()
                         columns = line.split()
                         logger.debug(f"Split parts: {parts}")
                         if len(columns) >= 6:
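For context on the loop these debug lines were removed from: pct list prints a header row followed by one container per line, so skipping the first line and splitting on whitespace yields VMID and status. A hedged standalone sketch with sample output (the container rows are made up, and the real format may vary between Proxmox versions):

# Made-up sample of typical `pct list` output.
sample_output = """VMID       Status     Lock         Name
100        running                 webserver
101        stopped                 backup
"""

for line in sample_output.split('\n')[1:]:   # skip the header row
    if not line.strip():
        continue
    parts = line.split()
    if len(parts) < 2:
        continue
    vmid, status = parts[0], parts[1]
    if status.lower() == 'running':
        print(f"would check disk usage for container {vmid}")  # -> 100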