From 2be4f9072c0bdda121bc248009d936e653cbd66b Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Mon, 3 Mar 2025 19:14:29 -0500 Subject: [PATCH] Variable descriptions for drive tickets --- hwmonDaemon.py | 115 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 105 insertions(+), 10 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 0aeb36d..142edbd 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -150,90 +150,185 @@ class SystemHealthMonitor: # Add SMART attribute explanations SMART_DESCRIPTIONS = { + 'Reported_Uncorrect': """ + Number of errors that could not be recovered using hardware ECC. + Impact: + - Indicates permanent data loss in affected sectors + - High correlation with drive hardware failure + - Critical reliability indicator + + Recommended Actions: + 1. Backup critical data immediately + 2. Check drive logs for related errors + 3. Plan for drive replacement + 4. Monitor for error count increases + """, + 'Reallocated_Sector_Ct': """ Number of sectors that have been reallocated due to errors. + Impact: - High counts indicate degrading media - Each reallocation uses one of the drive's limited spare sectors - Rapid increases suggest accelerating drive wear + + Recommended Actions: + 1. Monitor rate of increase + 2. Check drive temperature + 3. Plan replacement if count grows rapidly """, 'Current_Pending_Sector': """ Sectors waiting to be reallocated due to read/write errors. + Impact: - Indicates potentially unstable sectors - May result in data loss if unrecoverable - Should be monitored for increases + + Recommended Actions: + 1. Backup affected files + 2. Run extended SMART tests + 3. Monitor for conversion to reallocated sectors """, 'Offline_Uncorrectable': """ Count of uncorrectable errors detected during offline data collection. + Impact: - Direct indicator of media reliability issues - May affect data integrity - High values suggest drive replacement needed - """, - 'Reported_Uncorrect': """ - Number of errors that could not be recovered using hardware ECC. - - Critical indicator of drive health - - Directly impacts data reliability - - Any non-zero value requires attention + Recommended Actions: + 1. Run extended SMART tests + 2. Check drive logs + 3. Plan replacement if count is increasing """, 'Spin_Retry_Count': """ Number of spin start retry attempts. + Impact: - Indicates potential motor or bearing issues - May predict imminent mechanical failure - Increasing values suggest degrading drive health + + Recommended Actions: + 1. Monitor for rapid increases + 2. Check drive temperature + 3. Plan replacement if count grows rapidly """, 'Power_On_Hours': """ Total number of hours the device has been powered on. + Impact: - Normal aging metric - Used to gauge overall drive lifetime - Compare against manufacturer's MTBF rating + + Recommended Actions: + 1. Compare to warranty period + 2. Plan replacement if approaching rated lifetime """, 'Media_Wearout_Indicator': """ Percentage of drive's rated life remaining (SSDs). + Impact: - 100 indicates new drive - 0 indicates exceeded rated writes - Critical for SSD lifecycle management + + Recommended Actions: + 1. Plan replacement below 20% + 2. Monitor write workload + 3. Consider workload redistribution """, 'Temperature_Celsius': """ Current drive temperature. + Impact: - High temperatures accelerate wear - Optimal range: 20-45°C - Sustained high temps reduce lifespan + + Recommended Actions: + 1. Check system cooling + 2. Verify airflow + 3. Monitor for sustained high temperatures """, 'Available_Spare': """ Percentage of spare blocks remaining (SSDs). + Impact: - Critical for SSD endurance - Low values indicate approaching end-of-life - Rapid decreases suggest excessive writes + + Recommended Actions: + 1. Plan replacement if below 20% + 2. Monitor write patterns + 3. Consider workload changes """, 'Program_Fail_Count': """ Number of flash program operation failures. + Impact: - Indicates NAND cell reliability - Important for SSD health assessment - Increasing values suggest flash degradation + + Recommended Actions: + 1. Monitor rate of increase + 2. Check firmware updates + 3. Plan replacement if rapidly increasing """, 'Erase_Fail_Count': """ Number of flash erase operation failures. + Impact: - Related to NAND block health - Critical for SSD reliability - High counts suggest failing flash blocks + + Recommended Actions: + 1. Monitor count increases + 2. Check firmware version + 3. Plan replacement if count is high + """, + + 'Load_Cycle_Count': """ + Number of power cycles and head load/unload events. + Impact: + - Normal operation metric + - High counts may indicate power management issues + - Compare against rated cycles (typically 600k-1M) + + Recommended Actions: + 1. Review power management settings + 2. Monitor rate of increase + 3. Plan replacement near rated limit + """, + + 'Wear_Leveling_Count': """ + SSD block erase distribution metric. + Impact: + - Indicates wear pattern uniformity + - Higher values show more balanced wear + - Critical for SSD longevity + + Recommended Actions: + 1. Monitor trend over time + 2. Compare with similar drives + 3. Check workload distribution """ } + # Add relevant SMART descriptions + for attr in SMART_DESCRIPTIONS: + if attr in issue: + description += f"\n{attr}:\n{SMART_DESCRIPTIONS[attr]}\n" + if "SMART" in issue: description += """ - SMART (Self-Monitoring, Analysis, and Reporting Technology) issues indicate potential drive reliability problems. - - Reallocated sectors indicate bad blocks that have been remapped - - Pending sectors are potentially failing blocks waiting to be remapped - - Uncorrectable errors indicate data that could not be read + SMART (Self-Monitoring, Analysis, and Reporting Technology) Attribute Details: + - Possible drive failure! """ if "Temperature" in issue: