Updated function scoping
hwmonDaemon.py
@@ -76,6 +76,182 @@ class SystemHealthMonitor:
             'WD141KRYZ': ['02.01A02']
         }
     }
+    SEVERITY_INDICATORS = {
+        'CRITICAL': '🔴',
+        'WARNING': '🟡',
+        'HEALTHY': '🟢',
+        'UNKNOWN': '⚪'
+    }
+    SMART_DESCRIPTIONS = {
+        'Reported_Uncorrect': """
+Number of errors that could not be recovered using hardware ECC.
+Impact:
+- Indicates permanent data loss in affected sectors
+- High correlation with drive hardware failure
+- Critical reliability indicator
+
+Recommended Actions:
+1. Backup critical data immediately
+2. Check drive logs for related errors
+3. Plan for drive replacement
+4. Monitor for error count increases
+""",
+
+        'Reallocated_Sector_Ct': """
+Number of sectors that have been reallocated due to errors.
+Impact:
+- High counts indicate degrading media
+- Each reallocation uses one of the drive's limited spare sectors
+- Rapid increases suggest accelerating drive wear
+
+Recommended Actions:
+1. Monitor rate of increase
+2. Check drive temperature
+3. Plan replacement if count grows rapidly
+""",
+
+        'Current_Pending_Sector': """
+Sectors waiting to be reallocated due to read/write errors.
+Impact:
+- Indicates potentially unstable sectors
+- May result in data loss if unrecoverable
+- Should be monitored for increases
+
+Recommended Actions:
+1. Backup affected files
+2. Run extended SMART tests
+3. Monitor for conversion to reallocated sectors
+""",
+
+        'Offline_Uncorrectable': """
+Count of uncorrectable errors detected during offline data collection.
+Impact:
+- Direct indicator of media reliability issues
+- May affect data integrity
+- High values suggest drive replacement needed
+
+Recommended Actions:
+1. Run extended SMART tests
+2. Check drive logs
+3. Plan replacement if count is increasing
+""",
+
+        'Spin_Retry_Count': """
+Number of spin start retry attempts.
+Impact:
+- Indicates potential motor or bearing issues
+- May predict imminent mechanical failure
+- Increasing values suggest degrading drive health
+
+Recommended Actions:
+1. Monitor for rapid increases
+2. Check drive temperature
+3. Plan replacement if count grows rapidly
+""",
+
+        'Power_On_Hours': """
+Total number of hours the device has been powered on.
+Impact:
+- Normal aging metric
+- Used to gauge overall drive lifetime
+- Compare against manufacturer's MTBF rating
+
+Recommended Actions:
+1. Compare to warranty period
+2. Plan replacement if approaching rated lifetime
+""",
+
+        'Media_Wearout_Indicator': """
+Percentage of drive's rated life remaining (SSDs).
+Impact:
+- 100 indicates new drive
+- 0 indicates exceeded rated writes
+- Critical for SSD lifecycle management
+
+Recommended Actions:
+1. Plan replacement below 20%
+2. Monitor write workload
+3. Consider workload redistribution
+""",
+
+        'Temperature_Celsius': """
+Current drive temperature.
+Impact:
+- High temperatures accelerate wear
+- Optimal range: 20-45°C
+- Sustained high temps reduce lifespan
+
+Recommended Actions:
+1. Check system cooling
+2. Verify airflow
+3. Monitor for sustained high temperatures
+""",
+
+        'Available_Spare': """
+Percentage of spare blocks remaining (SSDs).
+Impact:
+- Critical for SSD endurance
+- Low values indicate approaching end-of-life
+- Rapid decreases suggest excessive writes
+
+Recommended Actions:
+1. Plan replacement if below 20%
+2. Monitor write patterns
+3. Consider workload changes
+""",
+
+        'Program_Fail_Count': """
+Number of flash program operation failures.
+Impact:
+- Indicates NAND cell reliability
+- Important for SSD health assessment
+- Increasing values suggest flash degradation
+
+Recommended Actions:
+1. Monitor rate of increase
+2. Check firmware updates
+3. Plan replacement if rapidly increasing
+""",
+
+        'Erase_Fail_Count': """
+Number of flash erase operation failures.
+Impact:
+- Related to NAND block health
+- Critical for SSD reliability
+- High counts suggest failing flash blocks
+
+Recommended Actions:
+1. Monitor count increases
+2. Check firmware version
+3. Plan replacement if count is high
+""",
+
+        'Load_Cycle_Count': """
+Number of power cycles and head load/unload events.
+Impact:
+- Normal operation metric
+- High counts may indicate power management issues
+- Compare against rated cycles (typically 600k-1M)
+
+Recommended Actions:
+1. Review power management settings
+2. Monitor rate of increase
+3. Plan replacement near rated limit
+""",
+
+        'Wear_Leveling_Count': """
+SSD block erase distribution metric.
+Impact:
+- Indicates wear pattern uniformity
+- Higher values show more balanced wear
+- Critical for SSD longevity
+
+Recommended Actions:
+1. Monitor trend over time
+2. Compare with similar drives
+3. Check workload distribution
+"""
+    }
 
     def __init__(self,
                  ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
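The tables above are plain class attributes, so a caller can read them without building any per-call state. The following is an illustrative sketch only, not part of this commit: the class name AttributeDescriber, the describe() helper, and the trimmed-down dicts are hypothetical stand-ins showing how the SEVERITY_INDICATORS and SMART_DESCRIPTIONS tables might be consumed when composing ticket text.

# Illustrative sketch -- not from hwmonDaemon.py.
class AttributeDescriber:
    # Reduced copies of the class-level tables added in the hunk above.
    SEVERITY_INDICATORS = {'CRITICAL': '🔴', 'WARNING': '🟡', 'HEALTHY': '🟢', 'UNKNOWN': '⚪'}
    SMART_DESCRIPTIONS = {
        'Reported_Uncorrect': 'Number of errors that could not be recovered using hardware ECC.',
    }

    def describe(self, attribute: str, severity: str = 'UNKNOWN') -> str:
        # Fall back to the UNKNOWN marker / a generic note for attributes not in the tables.
        icon = self.SEVERITY_INDICATORS.get(severity, self.SEVERITY_INDICATORS['UNKNOWN'])
        text = self.SMART_DESCRIPTIONS.get(attribute, 'No description available.')
        return f"{icon} {attribute}: {text}"

print(AttributeDescriber().describe('Reported_Uncorrect', 'CRITICAL'))
# -> 🔴 Reported_Uncorrect: Number of errors that could not be recovered using hardware ECC.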
@@ -186,19 +362,52 @@ class SystemHealthMonitor:
 
         return drive_details
 
+    STANDARD_WIDTH = 80
+
+    def make_box(title: str, content: str) -> str:
+        return f"""
+┏━ {title} {'━' * (content_width - len(title) - 3)}┓
+{content}
+┗{'━' * content_width}┛"""
+
+    # Format each section using the consistent width
+    sections = {
+        'DRIVE SPECIFICATIONS': ...,
+        'SMART STATUS': ...,
+        'PARTITION INFO': ...
+    }
+
+    # Each content line should pad to content_width
+    for section, content in sections.items():
+        formatted_content = '\n'.join(f"┃ {line:<{content_width-2}}┃" for line in content.split('\n'))
+        description += make_box(section, formatted_content)
+
+    def _get_issue_type(self, issue: str) -> str:
+        if "SMART" in issue:
+            return "SMART Health Issue"
+        elif "Drive" in issue:
+            return "Storage Issue"
+        elif "ECC" in issue:
+            return "Memory Issue"
+        elif "CPU" in issue:
+            return "Performance Issue"
+        elif "Network" in issue:
+            return "Network Issue"
+        return "Hardware Issue"
+
+    def _get_impact_level(self, issue: str) -> str:
+        if "CRITICAL" in issue or "UNHEALTHY" in issue:
+            return "🔴 Critical - Immediate Action Required"
+        elif "WARNING" in issue:
+            return "🟡 Warning - Action Needed Soon"
+        return "🟢 Low - Monitor Only"
+
     def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
         hostname = socket.gethostname()
         timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM"
 
-        # Calculate maximum width based on content
-        content_width = max(
-            len(hostname),
-            len(timestamp),
-            len(priority),
-            len("HARDWARE MONITORING ALERT TICKET")
-        ) + 10  # Add padding
-
+        content_width = STANDARD_WIDTH - 2
         banner = f"""
┏{'━' * content_width}┓
┃{' HARDWARE MONITORING ALERT TICKET '.center(content_width)}┃
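The hunk above drops the per-ticket max()-based width calculation in favour of a fixed STANDARD_WIDTH. Below is an illustrative, self-contained sketch of that fixed-width box style; it is not the committed code. In particular, this version passes content_width as a parameter, whereas the make_box added in the diff reads content_width from an enclosing scope.

# Illustrative sketch -- not from hwmonDaemon.py.
STANDARD_WIDTH = 80

def make_box(title: str, content: str, content_width: int = STANDARD_WIDTH - 2) -> str:
    # Pad every content line so the right-hand border lines up.
    body = '\n'.join(f"┃ {line:<{content_width - 2}}┃" for line in content.split('\n'))
    top = f"┏━ {title} " + '━' * (content_width - len(title) - 3) + '┓'
    bottom = '┗' + '━' * content_width + '┛'
    return '\n'.join((top, body, bottom))

print(make_box('SMART STATUS', 'Reallocated_Sector_Ct : 0\nTemperature_Celsius   : 38'))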
@@ -206,208 +415,7 @@ class SystemHealthMonitor:
 ┃ Host : {hostname:<{content_width-13}}┃
 ┃ Generated : {timestamp:<{content_width-13}}┃
 ┃ Priority : {priority:<{content_width-13}}┃
-┗{'━' * content_width}┛
+┗{'━' * content_width}┛"""
-"""
-
-        description = banner + "\n" + "┏━ ISSUE SUMMARY " + "━" * 50 + "\n" + issue + "\n\n"
-
-        # Add SMART attribute explanations
-        SMART_DESCRIPTIONS = {
-            'Reported_Uncorrect': """
-Number of errors that could not be recovered using hardware ECC.
-Impact:
-- Indicates permanent data loss in affected sectors
-- High correlation with drive hardware failure
-- Critical reliability indicator
-
-Recommended Actions:
-1. Backup critical data immediately
-2. Check drive logs for related errors
-3. Plan for drive replacement
-4. Monitor for error count increases
-""",
-
-            'Reallocated_Sector_Ct': """
-Number of sectors that have been reallocated due to errors.
-Impact:
-- High counts indicate degrading media
-- Each reallocation uses one of the drive's limited spare sectors
-- Rapid increases suggest accelerating drive wear
-
-Recommended Actions:
-1. Monitor rate of increase
-2. Check drive temperature
-3. Plan replacement if count grows rapidly
-""",
-
-            'Current_Pending_Sector': """
-Sectors waiting to be reallocated due to read/write errors.
-Impact:
-- Indicates potentially unstable sectors
-- May result in data loss if unrecoverable
-- Should be monitored for increases
-
-Recommended Actions:
-1. Backup affected files
-2. Run extended SMART tests
-3. Monitor for conversion to reallocated sectors
-""",
-
-            'Offline_Uncorrectable': """
-Count of uncorrectable errors detected during offline data collection.
-Impact:
-- Direct indicator of media reliability issues
-- May affect data integrity
-- High values suggest drive replacement needed
-
-Recommended Actions:
-1. Run extended SMART tests
-2. Check drive logs
-3. Plan replacement if count is increasing
-""",
-
-            'Spin_Retry_Count': """
-Number of spin start retry attempts.
-Impact:
-- Indicates potential motor or bearing issues
-- May predict imminent mechanical failure
-- Increasing values suggest degrading drive health
-
-Recommended Actions:
-1. Monitor for rapid increases
-2. Check drive temperature
-3. Plan replacement if count grows rapidly
-""",
-
-            'Power_On_Hours': """
-Total number of hours the device has been powered on.
-Impact:
-- Normal aging metric
-- Used to gauge overall drive lifetime
-- Compare against manufacturer's MTBF rating
-
-Recommended Actions:
-1. Compare to warranty period
-2. Plan replacement if approaching rated lifetime
-""",
-
-            'Media_Wearout_Indicator': """
-Percentage of drive's rated life remaining (SSDs).
-Impact:
-- 100 indicates new drive
-- 0 indicates exceeded rated writes
-- Critical for SSD lifecycle management
-
-Recommended Actions:
-1. Plan replacement below 20%
-2. Monitor write workload
-3. Consider workload redistribution
-""",
-
-            'Temperature_Celsius': """
-Current drive temperature.
-Impact:
-- High temperatures accelerate wear
-- Optimal range: 20-45°C
-- Sustained high temps reduce lifespan
-
-Recommended Actions:
-1. Check system cooling
-2. Verify airflow
-3. Monitor for sustained high temperatures
-""",
-
-            'Available_Spare': """
-Percentage of spare blocks remaining (SSDs).
-Impact:
-- Critical for SSD endurance
-- Low values indicate approaching end-of-life
-- Rapid decreases suggest excessive writes
-
-Recommended Actions:
-1. Plan replacement if below 20%
-2. Monitor write patterns
-3. Consider workload changes
-""",
-
-            'Program_Fail_Count': """
-Number of flash program operation failures.
-Impact:
-- Indicates NAND cell reliability
-- Important for SSD health assessment
-- Increasing values suggest flash degradation
-
-Recommended Actions:
-1. Monitor rate of increase
-2. Check firmware updates
-3. Plan replacement if rapidly increasing
-""",
-
-            'Erase_Fail_Count': """
-Number of flash erase operation failures.
-Impact:
-- Related to NAND block health
-- Critical for SSD reliability
-- High counts suggest failing flash blocks
-
-Recommended Actions:
-1. Monitor count increases
-2. Check firmware version
-3. Plan replacement if count is high
-""",
-
-            'Load_Cycle_Count': """
-Number of power cycles and head load/unload events.
-Impact:
-- Normal operation metric
-- High counts may indicate power management issues
-- Compare against rated cycles (typically 600k-1M)
-
-Recommended Actions:
-1. Review power management settings
-2. Monitor rate of increase
-3. Plan replacement near rated limit
-""",
-
-            'Wear_Leveling_Count': """
-SSD block erase distribution metric.
-Impact:
-- Indicates wear pattern uniformity
-- Higher values show more balanced wear
-- Critical for SSD longevity
-
-Recommended Actions:
-1. Monitor trend over time
-2. Compare with similar drives
-3. Check workload distribution
-"""
-        }
-        SEVERITY_INDICATORS = {
-            'CRITICAL': '🔴',
-            'WARNING': '🟡',
-            'HEALTHY': '🟢',
-            'UNKNOWN': '⚪'
-        }
-
-        def _get_issue_type(self, issue: str) -> str:
-            if "SMART" in issue:
-                return "SMART Health Issue"
-            elif "Drive" in issue:
-                return "Storage Issue"
-            elif "ECC" in issue:
-                return "Memory Issue"
-            elif "CPU" in issue:
-                return "Performance Issue"
-            elif "Network" in issue:
-                return "Network Issue"
-            return "Hardware Issue"
-
-        def _get_impact_level(self, issue: str) -> str:
-            if "CRITICAL" in issue or "UNHEALTHY" in issue:
-                return "🔴 Critical - Immediate Action Required"
-            elif "WARNING" in issue:
-                return "🟡 Warning - Action Needed Soon"
-            return "🟢 Low - Monitor Only"
 
         executive_summary = f"""
 ┏━ EXECUTIVE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
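The commit title refers to where these tables and helpers live: previously the dict literals were rebuilt inside _generate_detailed_description on every call, while as class attributes and methods they are created once when the class is defined. A small, illustrative comparison of the two scopings follows; the class names here are hypothetical and not taken from the daemon.

# Illustrative sketch -- contrasts function-scoped vs class-scoped lookup tables.
class FunctionScoped:
    def indicator(self, level: str) -> str:
        # Dict literal is rebuilt on every call.
        severity_indicators = {'CRITICAL': '🔴', 'WARNING': '🟡', 'HEALTHY': '🟢', 'UNKNOWN': '⚪'}
        return severity_indicators.get(level, '⚪')

class ClassScoped:
    # Built once at class-definition time; shared by all instances and callers.
    SEVERITY_INDICATORS = {'CRITICAL': '🔴', 'WARNING': '🟡', 'HEALTHY': '🟢', 'UNKNOWN': '⚪'}

    def indicator(self, level: str) -> str:
        return self.SEVERITY_INDICATORS.get(level, '⚪')

# Both scopings return the same result; only construction cost and visibility differ.
assert FunctionScoped().indicator('WARNING') == ClassScoped().indicator('WARNING') == '🟡'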