Updated function scoping

This commit is contained in:
2025-03-06 11:26:55 -05:00
parent e3e0c73630
commit b2cae0b6aa

View File

@ -76,6 +76,182 @@ class SystemHealthMonitor:
'WD141KRYZ': ['02.01A02']
}
}
# Lookup from a severity label to a single indicator glyph.
# NOTE(review): 'UNKNOWN' maps to an empty string while the other three
# levels map to a coloured circle — confirm this is intentional and not a
# glyph that went missing.
SEVERITY_INDICATORS = {
    'CRITICAL': '🔴',
    'WARNING': '🟡',
    'HEALTHY': '🟢',
    'UNKNOWN': ''
}
# Human-readable explanations for SMART attributes, keyed by attribute name.
# Every value is a multi-line text block with the same layout: a one-line
# description, an "Impact:" bullet list and a numbered "Recommended Actions:"
# list. The string bodies are kept flush-left on purpose: any leading
# whitespace inside the triple quotes would become part of the text.
SMART_DESCRIPTIONS = {
    'Reported_Uncorrect': """
Number of errors that could not be recovered using hardware ECC.
Impact:
- Indicates permanent data loss in affected sectors
- High correlation with drive hardware failure
- Critical reliability indicator
Recommended Actions:
1. Backup critical data immediately
2. Check drive logs for related errors
3. Plan for drive replacement
4. Monitor for error count increases
""",
    'Reallocated_Sector_Ct': """
Number of sectors that have been reallocated due to errors.
Impact:
- High counts indicate degrading media
- Each reallocation uses one of the drive's limited spare sectors
- Rapid increases suggest accelerating drive wear
Recommended Actions:
1. Monitor rate of increase
2. Check drive temperature
3. Plan replacement if count grows rapidly
""",
    'Current_Pending_Sector': """
Sectors waiting to be reallocated due to read/write errors.
Impact:
- Indicates potentially unstable sectors
- May result in data loss if unrecoverable
- Should be monitored for increases
Recommended Actions:
1. Backup affected files
2. Run extended SMART tests
3. Monitor for conversion to reallocated sectors
""",
    'Offline_Uncorrectable': """
Count of uncorrectable errors detected during offline data collection.
Impact:
- Direct indicator of media reliability issues
- May affect data integrity
- High values suggest drive replacement needed
Recommended Actions:
1. Run extended SMART tests
2. Check drive logs
3. Plan replacement if count is increasing
""",
    'Spin_Retry_Count': """
Number of spin start retry attempts.
Impact:
- Indicates potential motor or bearing issues
- May predict imminent mechanical failure
- Increasing values suggest degrading drive health
Recommended Actions:
1. Monitor for rapid increases
2. Check drive temperature
3. Plan replacement if count grows rapidly
""",
    'Power_On_Hours': """
Total number of hours the device has been powered on.
Impact:
- Normal aging metric
- Used to gauge overall drive lifetime
- Compare against manufacturer's MTBF rating
Recommended Actions:
1. Compare to warranty period
2. Plan replacement if approaching rated lifetime
""",
    'Media_Wearout_Indicator': """
Percentage of drive's rated life remaining (SSDs).
Impact:
- 100 indicates new drive
- 0 indicates exceeded rated writes
- Critical for SSD lifecycle management
Recommended Actions:
1. Plan replacement below 20%
2. Monitor write workload
3. Consider workload redistribution
""",
    'Temperature_Celsius': """
Current drive temperature.
Impact:
- High temperatures accelerate wear
- Optimal range: 20-45°C
- Sustained high temps reduce lifespan
Recommended Actions:
1. Check system cooling
2. Verify airflow
3. Monitor for sustained high temperatures
""",
    'Available_Spare': """
Percentage of spare blocks remaining (SSDs).
Impact:
- Critical for SSD endurance
- Low values indicate approaching end-of-life
- Rapid decreases suggest excessive writes
Recommended Actions:
1. Plan replacement if below 20%
2. Monitor write patterns
3. Consider workload changes
""",
    'Program_Fail_Count': """
Number of flash program operation failures.
Impact:
- Indicates NAND cell reliability
- Important for SSD health assessment
- Increasing values suggest flash degradation
Recommended Actions:
1. Monitor rate of increase
2. Check firmware updates
3. Plan replacement if rapidly increasing
""",
    'Erase_Fail_Count': """
Number of flash erase operation failures.
Impact:
- Related to NAND block health
- Critical for SSD reliability
- High counts suggest failing flash blocks
Recommended Actions:
1. Monitor count increases
2. Check firmware version
3. Plan replacement if count is high
""",
    # Attribute 193 tracks head load/unload cycles only; power cycles are a
    # separate attribute (Power_Cycle_Count), so they are not mentioned here.
    'Load_Cycle_Count': """
Number of head load/unload cycles.
Impact:
- Normal operation metric
- High counts may indicate power management issues
- Compare against rated cycles (typically 600k-1M)
Recommended Actions:
1. Review power management settings
2. Monitor rate of increase
3. Plan replacement near rated limit
""",
    'Wear_Leveling_Count': """
SSD block erase distribution metric.
Impact:
- Indicates wear pattern uniformity
- Higher values show more balanced wear
- Critical for SSD longevity
Recommended Actions:
1. Monitor trend over time
2. Compare with similar drives
3. Check workload distribution
"""
}
def __init__(self,
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
@ -186,19 +362,52 @@ class SystemHealthMonitor:
return drive_details
# Total rendered width, in characters, of every boxed section.
STANDARD_WIDTH = 80


def make_box(title: str, content: str, content_width: int = STANDARD_WIDTH - 2) -> str:
    """Wrap *content* between a titled top border and a plain bottom border.

    Args:
        title: Heading embedded in the top border.
        content: Pre-formatted body text, inserted verbatim between the borders.
        content_width: Number of fill characters in the bottom border. Both
            border lines are ``content_width + 2`` characters long as long as
            the title fits. Defaults to ``STANDARD_WIDTH - 2`` so the box is
            ``STANDARD_WIDTH`` wide.

    Returns:
        The box as a string that starts with a newline and has no trailing
        newline, so consecutive boxes can simply be concatenated.
    """
    # "┏━ " (3 chars) + title + " " + fill + "┓" must add up to content_width + 2.
    # A title that is too long yields a non-positive repeat count, which Python
    # turns into an empty fill rather than an error.
    top_fill = "━" * (content_width - len(title) - 3)
    top_border = f"┏━ {title} {top_fill}┓"
    bottom_border = f"┗{'━' * content_width}┛"
    return f"\n{top_border}\n{content}\n{bottom_border}"
# Format each section using the consistent width
# NOTE(review): every value below is the Ellipsis placeholder (`...`), not a
# string. The loop further down calls `.split` on each value, which raises
# AttributeError on Ellipsis — real section text presumably has to be filled
# in before this can run.
sections = {
    'DRIVE SPECIFICATIONS': ...,
    'SMART STATUS': ...,
    'PARTITION INFO': ...
}
# Each content line should pad to content_width
# NOTE(review): `content_width` and `description` are not assigned anywhere in
# this span; they must already exist in the enclosing scope — confirm.
for section, content in sections.items():
    # Left-justify every line to content_width - 2 characters, then append the
    # boxed section to the accumulated description text.
    formatted_content = '\n'.join(f"{line:<{content_width-2}}" for line in content.split('\n'))
    description += make_box(section, formatted_content)
def _get_issue_type(self, issue: str) -> str:
if "SMART" in issue:
return "SMART Health Issue"
elif "Drive" in issue:
return "Storage Issue"
elif "ECC" in issue:
return "Memory Issue"
elif "CPU" in issue:
return "Performance Issue"
elif "Network" in issue:
return "Network Issue"
return "Hardware Issue"
def _get_impact_level(self, issue: str) -> str:
if "CRITICAL" in issue or "UNHEALTHY" in issue:
return "🔴 Critical - Immediate Action Required"
elif "WARNING" in issue:
return "🟡 Warning - Action Needed Soon"
return "🟢 Low - Monitor Only"
def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
hostname = socket.gethostname()
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM"
# Calculate maximum width based on content
content_width = max(
len(hostname),
len(timestamp),
len(priority),
len("HARDWARE MONITORING ALERT TICKET")
) + 10 # Add padding
content_width = STANDARD_WIDTH - 2
banner = f"""
{'' * content_width}
{' HARDWARE MONITORING ALERT TICKET '.center(content_width)}
@ -206,208 +415,7 @@ class SystemHealthMonitor:
┃ Host : {hostname:<{content_width-13}}
┃ Generated : {timestamp:<{content_width-13}}
┃ Priority : {priority:<{content_width-13}}
{'' * content_width}
"""
description = banner + "\n" + "┏━ ISSUE SUMMARY " + "" * 50 + "\n" + issue + "\n\n"
# Add SMART attribute explanations
SMART_DESCRIPTIONS = {
'Reported_Uncorrect': """
Number of errors that could not be recovered using hardware ECC.
Impact:
- Indicates permanent data loss in affected sectors
- High correlation with drive hardware failure
- Critical reliability indicator
Recommended Actions:
1. Backup critical data immediately
2. Check drive logs for related errors
3. Plan for drive replacement
4. Monitor for error count increases
""",
'Reallocated_Sector_Ct': """
Number of sectors that have been reallocated due to errors.
Impact:
- High counts indicate degrading media
- Each reallocation uses one of the drive's limited spare sectors
- Rapid increases suggest accelerating drive wear
Recommended Actions:
1. Monitor rate of increase
2. Check drive temperature
3. Plan replacement if count grows rapidly
""",
'Current_Pending_Sector': """
Sectors waiting to be reallocated due to read/write errors.
Impact:
- Indicates potentially unstable sectors
- May result in data loss if unrecoverable
- Should be monitored for increases
Recommended Actions:
1. Backup affected files
2. Run extended SMART tests
3. Monitor for conversion to reallocated sectors
""",
'Offline_Uncorrectable': """
Count of uncorrectable errors detected during offline data collection.
Impact:
- Direct indicator of media reliability issues
- May affect data integrity
- High values suggest drive replacement needed
Recommended Actions:
1. Run extended SMART tests
2. Check drive logs
3. Plan replacement if count is increasing
""",
'Spin_Retry_Count': """
Number of spin start retry attempts.
Impact:
- Indicates potential motor or bearing issues
- May predict imminent mechanical failure
- Increasing values suggest degrading drive health
Recommended Actions:
1. Monitor for rapid increases
2. Check drive temperature
3. Plan replacement if count grows rapidly
""",
'Power_On_Hours': """
Total number of hours the device has been powered on.
Impact:
- Normal aging metric
- Used to gauge overall drive lifetime
- Compare against manufacturer's MTBF rating
Recommended Actions:
1. Compare to warranty period
2. Plan replacement if approaching rated lifetime
""",
'Media_Wearout_Indicator': """
Percentage of drive's rated life remaining (SSDs).
Impact:
- 100 indicates new drive
- 0 indicates exceeded rated writes
- Critical for SSD lifecycle management
Recommended Actions:
1. Plan replacement below 20%
2. Monitor write workload
3. Consider workload redistribution
""",
'Temperature_Celsius': """
Current drive temperature.
Impact:
- High temperatures accelerate wear
- Optimal range: 20-45°C
- Sustained high temps reduce lifespan
Recommended Actions:
1. Check system cooling
2. Verify airflow
3. Monitor for sustained high temperatures
""",
'Available_Spare': """
Percentage of spare blocks remaining (SSDs).
Impact:
- Critical for SSD endurance
- Low values indicate approaching end-of-life
- Rapid decreases suggest excessive writes
Recommended Actions:
1. Plan replacement if below 20%
2. Monitor write patterns
3. Consider workload changes
""",
'Program_Fail_Count': """
Number of flash program operation failures.
Impact:
- Indicates NAND cell reliability
- Important for SSD health assessment
- Increasing values suggest flash degradation
Recommended Actions:
1. Monitor rate of increase
2. Check firmware updates
3. Plan replacement if rapidly increasing
""",
'Erase_Fail_Count': """
Number of flash erase operation failures.
Impact:
- Related to NAND block health
- Critical for SSD reliability
- High counts suggest failing flash blocks
Recommended Actions:
1. Monitor count increases
2. Check firmware version
3. Plan replacement if count is high
""",
'Load_Cycle_Count': """
Number of power cycles and head load/unload events.
Impact:
- Normal operation metric
- High counts may indicate power management issues
- Compare against rated cycles (typically 600k-1M)
Recommended Actions:
1. Review power management settings
2. Monitor rate of increase
3. Plan replacement near rated limit
""",
'Wear_Leveling_Count': """
SSD block erase distribution metric.
Impact:
- Indicates wear pattern uniformity
- Higher values show more balanced wear
- Critical for SSD longevity
Recommended Actions:
1. Monitor trend over time
2. Compare with similar drives
3. Check workload distribution
"""
}
SEVERITY_INDICATORS = {
'CRITICAL': '🔴',
'WARNING': '🟡',
'HEALTHY': '🟢',
'UNKNOWN': ''
}
def _get_issue_type(self, issue: str) -> str:
if "SMART" in issue:
return "SMART Health Issue"
elif "Drive" in issue:
return "Storage Issue"
elif "ECC" in issue:
return "Memory Issue"
elif "CPU" in issue:
return "Performance Issue"
elif "Network" in issue:
return "Network Issue"
return "Hardware Issue"
def _get_impact_level(self, issue: str) -> str:
if "CRITICAL" in issue or "UNHEALTHY" in issue:
return "🔴 Critical - Immediate Action Required"
elif "WARNING" in issue:
return "🟡 Warning - Action Needed Soon"
return "🟢 Low - Monitor Only"
{'' * content_width}"""
executive_summary = f"""
┏━ EXECUTIVE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓