From b2cae0b6aa5f759c233aa57ecb9f4ed38e6224ec Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Thu, 6 Mar 2025 11:26:55 -0500
Subject: [PATCH] Updated function scoping

---
 hwmonDaemon.py | 428 +++++++++++++++++++++++++------------------------
 1 file changed, 218 insertions(+), 210 deletions(-)

diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index d523f6e..b509d59 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -76,6 +76,182 @@ class SystemHealthMonitor:
             'WD141KRYZ': ['02.01A02']
         }
     }
+    SEVERITY_INDICATORS = {
+        'CRITICAL': '🔴',
+        'WARNING': '🟡',
+        'HEALTHY': '🟢',
+        'UNKNOWN': '⚪'
+    }
+    SMART_DESCRIPTIONS = {
+        'Reported_Uncorrect': """
+            Number of errors that could not be recovered using hardware ECC.
+            Impact:
+            - Indicates permanent data loss in affected sectors
+            - High correlation with drive hardware failure
+            - Critical reliability indicator
+
+            Recommended Actions:
+            1. Backup critical data immediately
+            2. Check drive logs for related errors
+            3. Plan for drive replacement
+            4. Monitor for error count increases
+        """,
+
+        'Reallocated_Sector_Ct': """
+            Number of sectors that have been reallocated due to errors.
+            Impact:
+            - High counts indicate degrading media
+            - Each reallocation uses one of the drive's limited spare sectors
+            - Rapid increases suggest accelerating drive wear
+
+            Recommended Actions:
+            1. Monitor rate of increase
+            2. Check drive temperature
+            3. Plan replacement if count grows rapidly
+        """,
+
+        'Current_Pending_Sector': """
+            Sectors waiting to be reallocated due to read/write errors.
+            Impact:
+            - Indicates potentially unstable sectors
+            - May result in data loss if unrecoverable
+            - Should be monitored for increases
+
+            Recommended Actions:
+            1. Backup affected files
+            2. Run extended SMART tests
+            3. Monitor for conversion to reallocated sectors
+        """,
+
+        'Offline_Uncorrectable': """
+            Count of uncorrectable errors detected during offline data collection.
+            Impact:
+            - Direct indicator of media reliability issues
+            - May affect data integrity
+            - High values suggest drive replacement needed
+
+            Recommended Actions:
+            1. Run extended SMART tests
+            2. Check drive logs
+            3. Plan replacement if count is increasing
+        """,
+
+        'Spin_Retry_Count': """
+            Number of spin start retry attempts.
+            Impact:
+            - Indicates potential motor or bearing issues
+            - May predict imminent mechanical failure
+            - Increasing values suggest degrading drive health
+
+            Recommended Actions:
+            1. Monitor for rapid increases
+            2. Check drive temperature
+            3. Plan replacement if count grows rapidly
+        """,
+
+        'Power_On_Hours': """
+            Total number of hours the device has been powered on.
+            Impact:
+            - Normal aging metric
+            - Used to gauge overall drive lifetime
+            - Compare against manufacturer's MTBF rating
+
+            Recommended Actions:
+            1. Compare to warranty period
+            2. Plan replacement if approaching rated lifetime
+        """,
+
+        'Media_Wearout_Indicator': """
+            Percentage of drive's rated life remaining (SSDs).
+            Impact:
+            - 100 indicates new drive
+            - 0 indicates exceeded rated writes
+            - Critical for SSD lifecycle management
+
+            Recommended Actions:
+            1. Plan replacement below 20%
+            2. Monitor write workload
+            3. Consider workload redistribution
+        """,
+
+        'Temperature_Celsius': """
+            Current drive temperature.
+            Impact:
+            - High temperatures accelerate wear
+            - Optimal range: 20-45°C
+            - Sustained high temps reduce lifespan
+
+            Recommended Actions:
+            1. Check system cooling
+            2. Verify airflow
+            3. Monitor for sustained high temperatures
+        """,
+
+        'Available_Spare': """
+            Percentage of spare blocks remaining (SSDs).
+            Impact:
+            - Critical for SSD endurance
+            - Low values indicate approaching end-of-life
+            - Rapid decreases suggest excessive writes
+
+            Recommended Actions:
+            1. Plan replacement if below 20%
+            2. Monitor write patterns
+            3. Consider workload changes
+        """,
+
+        'Program_Fail_Count': """
+            Number of flash program operation failures.
+            Impact:
+            - Indicates NAND cell reliability
+            - Important for SSD health assessment
+            - Increasing values suggest flash degradation
+
+            Recommended Actions:
+            1. Monitor rate of increase
+            2. Check firmware updates
+            3. Plan replacement if rapidly increasing
+        """,
+
+        'Erase_Fail_Count': """
+            Number of flash erase operation failures.
+            Impact:
+            - Related to NAND block health
+            - Critical for SSD reliability
+            - High counts suggest failing flash blocks
+
+            Recommended Actions:
+            1. Monitor count increases
+            2. Check firmware version
+            3. Plan replacement if count is high
+        """,
+
+        'Load_Cycle_Count': """
+            Number of power cycles and head load/unload events.
+            Impact:
+            - Normal operation metric
+            - High counts may indicate power management issues
+            - Compare against rated cycles (typically 600k-1M)
+
+            Recommended Actions:
+            1. Review power management settings
+            2. Monitor rate of increase
+            3. Plan replacement near rated limit
+        """,
+
+        'Wear_Leveling_Count': """
+            SSD block erase distribution metric.
+            Impact:
+            - Indicates wear pattern uniformity
+            - Higher values show more balanced wear
+            - Critical for SSD longevity
+
+            Recommended Actions:
+            1. Monitor trend over time
+            2. Compare with similar drives
+            3. Check workload distribution
+        """
+    }
 
     def __init__(self,
                  ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
@@ -186,19 +362,52 @@ class SystemHealthMonitor:
         return drive_details
 
+    STANDARD_WIDTH = 80
+
+    def make_box(title: str, content: str) -> str:
+        return f"""
+┏━ {title} {'━' * (content_width - len(title) - 3)}┓
+{content}
+┗{'━' * content_width}┛"""
+
+    # Format each section using the consistent width
+    sections = {
+        'DRIVE SPECIFICATIONS': ...,
+        'SMART STATUS': ...,
+        'PARTITION INFO': ...
+    }
+
+    # Each content line should pad to content_width
+    for section, content in sections.items():
+        formatted_content = '\n'.join(f"┃ {line:<{content_width-2}}┃" for line in content.split('\n'))
+        description += make_box(section, formatted_content)
+
+    def _get_issue_type(self, issue: str) -> str:
+        if "SMART" in issue:
+            return "SMART Health Issue"
+        elif "Drive" in issue:
+            return "Storage Issue"
+        elif "ECC" in issue:
+            return "Memory Issue"
+        elif "CPU" in issue:
+            return "Performance Issue"
+        elif "Network" in issue:
+            return "Network Issue"
+        return "Hardware Issue"
+
+    def _get_impact_level(self, issue: str) -> str:
+        if "CRITICAL" in issue or "UNHEALTHY" in issue:
+            return "🔴 Critical - Immediate Action Required"
+        elif "WARNING" in issue:
+            return "🟡 Warning - Action Needed Soon"
+        return "🟢 Low - Monitor Only"
+
     def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
         hostname = socket.gethostname()
         timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM"
-        # Calculate maximum width based on content
-        content_width = max(
-            len(hostname),
-            len(timestamp),
-            len(priority),
-            len("HARDWARE MONITORING ALERT TICKET")
-        ) + 10  # Add padding
-
+        content_width = STANDARD_WIDTH - 2
         banner = f"""
 ┏{'━' * content_width}┓
 ┃{' HARDWARE MONITORING ALERT TICKET '.center(content_width)}┃
@@ -206,208 +415,7 @@ class SystemHealthMonitor:
 ┃ Host      : {hostname:<{content_width-13}}┃
 ┃ Generated : {timestamp:<{content_width-13}}┃
 ┃ Priority  : {priority:<{content_width-13}}┃
-┗{'━' * content_width}┛
-    """
-
-    description = banner + "\n" + "┏━ ISSUE SUMMARY " + "━" * 50 + "\n" + issue + "\n\n"
-
-    # Add SMART attribute explanations
-    SMART_DESCRIPTIONS = {
-        'Reported_Uncorrect': """
-            Number of errors that could not be recovered using hardware ECC.
-            Impact:
-            - Indicates permanent data loss in affected sectors
-            - High correlation with drive hardware failure
-            - Critical reliability indicator
-
-            Recommended Actions:
-            1. Backup critical data immediately
-            2. Check drive logs for related errors
-            3. Plan for drive replacement
-            4. Monitor for error count increases
-        """,
-
-        'Reallocated_Sector_Ct': """
-            Number of sectors that have been reallocated due to errors.
-            Impact:
-            - High counts indicate degrading media
-            - Each reallocation uses one of the drive's limited spare sectors
-            - Rapid increases suggest accelerating drive wear
-
-            Recommended Actions:
-            1. Monitor rate of increase
-            2. Check drive temperature
-            3. Plan replacement if count grows rapidly
-        """,
-
-        'Current_Pending_Sector': """
-            Sectors waiting to be reallocated due to read/write errors.
-            Impact:
-            - Indicates potentially unstable sectors
-            - May result in data loss if unrecoverable
-            - Should be monitored for increases
-
-            Recommended Actions:
-            1. Backup affected files
-            2. Run extended SMART tests
-            3. Monitor for conversion to reallocated sectors
-        """,
-
-        'Offline_Uncorrectable': """
-            Count of uncorrectable errors detected during offline data collection.
-            Impact:
-            - Direct indicator of media reliability issues
-            - May affect data integrity
-            - High values suggest drive replacement needed
-
-            Recommended Actions:
-            1. Run extended SMART tests
-            2. Check drive logs
-            3. Plan replacement if count is increasing
-        """,
-
-        'Spin_Retry_Count': """
-            Number of spin start retry attempts.
-            Impact:
-            - Indicates potential motor or bearing issues
-            - May predict imminent mechanical failure
-            - Increasing values suggest degrading drive health
-
-            Recommended Actions:
-            1. Monitor for rapid increases
-            2. Check drive temperature
-            3. Plan replacement if count grows rapidly
-        """,
-
-        'Power_On_Hours': """
-            Total number of hours the device has been powered on.
-            Impact:
-            - Normal aging metric
-            - Used to gauge overall drive lifetime
-            - Compare against manufacturer's MTBF rating
-
-            Recommended Actions:
-            1. Compare to warranty period
-            2. Plan replacement if approaching rated lifetime
-        """,
-
-        'Media_Wearout_Indicator': """
-            Percentage of drive's rated life remaining (SSDs).
-            Impact:
-            - 100 indicates new drive
-            - 0 indicates exceeded rated writes
-            - Critical for SSD lifecycle management
-
-            Recommended Actions:
-            1. Plan replacement below 20%
-            2. Monitor write workload
-            3. Consider workload redistribution
-        """,
-
-        'Temperature_Celsius': """
-            Current drive temperature.
-            Impact:
-            - High temperatures accelerate wear
-            - Optimal range: 20-45°C
-            - Sustained high temps reduce lifespan
-
-            Recommended Actions:
-            1. Check system cooling
-            2. Verify airflow
-            3. Monitor for sustained high temperatures
-        """,
-
-        'Available_Spare': """
-            Percentage of spare blocks remaining (SSDs).
-            Impact:
-            - Critical for SSD endurance
-            - Low values indicate approaching end-of-life
-            - Rapid decreases suggest excessive writes
-
-            Recommended Actions:
-            1. Plan replacement if below 20%
-            2. Monitor write patterns
-            3. Consider workload changes
-        """,
-
-        'Program_Fail_Count': """
-            Number of flash program operation failures.
-            Impact:
-            - Indicates NAND cell reliability
-            - Important for SSD health assessment
-            - Increasing values suggest flash degradation
-
-            Recommended Actions:
-            1. Monitor rate of increase
-            2. Check firmware updates
-            3. Plan replacement if rapidly increasing
-        """,
-
-        'Erase_Fail_Count': """
-            Number of flash erase operation failures.
-            Impact:
-            - Related to NAND block health
-            - Critical for SSD reliability
-            - High counts suggest failing flash blocks
-
-            Recommended Actions:
-            1. Monitor count increases
-            2. Check firmware version
-            3. Plan replacement if count is high
-        """,
-
-        'Load_Cycle_Count': """
-            Number of power cycles and head load/unload events.
-            Impact:
-            - Normal operation metric
-            - High counts may indicate power management issues
-            - Compare against rated cycles (typically 600k-1M)
-
-            Recommended Actions:
-            1. Review power management settings
-            2. Monitor rate of increase
-            3. Plan replacement near rated limit
-        """,
-
-        'Wear_Leveling_Count': """
-            SSD block erase distribution metric.
-            Impact:
-            - Indicates wear pattern uniformity
-            - Higher values show more balanced wear
-            - Critical for SSD longevity
-
-            Recommended Actions:
-            1. Monitor trend over time
-            2. Compare with similar drives
-            3. Check workload distribution
-        """
-    }
-    SEVERITY_INDICATORS = {
-        'CRITICAL': '🔴',
-        'WARNING': '🟡',
-        'HEALTHY': '🟢',
-        'UNKNOWN': '⚪'
-}
-
-    def _get_issue_type(self, issue: str) -> str:
-        if "SMART" in issue:
-            return "SMART Health Issue"
-        elif "Drive" in issue:
-            return "Storage Issue"
-        elif "ECC" in issue:
-            return "Memory Issue"
-        elif "CPU" in issue:
-            return "Performance Issue"
-        elif "Network" in issue:
-            return "Network Issue"
-        return "Hardware Issue"
-
-    def _get_impact_level(self, issue: str) -> str:
-        if "CRITICAL" in issue or "UNHEALTHY" in issue:
-            return "🔴 Critical - Immediate Action Required"
-        elif "WARNING" in issue:
-            return "🟡 Warning - Action Needed Soon"
-        return "🟢 Low - Monitor Only"
+┗{'━' * content_width}┛"""
 
         executive_summary = f"""
 ┏━ EXECUTIVE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓