Updated function scoping
This commit is contained in:
428
hwmonDaemon.py
428
hwmonDaemon.py
@ -76,6 +76,182 @@ class SystemHealthMonitor:
|
||||
'WD141KRYZ': ['02.01A02']
|
||||
}
|
||||
}
|
||||
SEVERITY_INDICATORS = {
|
||||
'CRITICAL': '🔴',
|
||||
'WARNING': '🟡',
|
||||
'HEALTHY': '🟢',
|
||||
'UNKNOWN': '⚪'
|
||||
}
|
||||
SMART_DESCRIPTIONS = {
|
||||
'Reported_Uncorrect': """
|
||||
Number of errors that could not be recovered using hardware ECC.
|
||||
Impact:
|
||||
- Indicates permanent data loss in affected sectors
|
||||
- High correlation with drive hardware failure
|
||||
- Critical reliability indicator
|
||||
|
||||
Recommended Actions:
|
||||
1. Backup critical data immediately
|
||||
2. Check drive logs for related errors
|
||||
3. Plan for drive replacement
|
||||
4. Monitor for error count increases
|
||||
""",
|
||||
|
||||
'Reallocated_Sector_Ct': """
|
||||
Number of sectors that have been reallocated due to errors.
|
||||
Impact:
|
||||
- High counts indicate degrading media
|
||||
- Each reallocation uses one of the drive's limited spare sectors
|
||||
- Rapid increases suggest accelerating drive wear
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor rate of increase
|
||||
2. Check drive temperature
|
||||
3. Plan replacement if count grows rapidly
|
||||
""",
|
||||
|
||||
'Current_Pending_Sector': """
|
||||
Sectors waiting to be reallocated due to read/write errors.
|
||||
Impact:
|
||||
- Indicates potentially unstable sectors
|
||||
- May result in data loss if unrecoverable
|
||||
- Should be monitored for increases
|
||||
|
||||
Recommended Actions:
|
||||
1. Backup affected files
|
||||
2. Run extended SMART tests
|
||||
3. Monitor for conversion to reallocated sectors
|
||||
""",
|
||||
|
||||
'Offline_Uncorrectable': """
|
||||
Count of uncorrectable errors detected during offline data collection.
|
||||
Impact:
|
||||
- Direct indicator of media reliability issues
|
||||
- May affect data integrity
|
||||
- High values suggest drive replacement needed
|
||||
|
||||
Recommended Actions:
|
||||
1. Run extended SMART tests
|
||||
2. Check drive logs
|
||||
3. Plan replacement if count is increasing
|
||||
""",
|
||||
|
||||
'Spin_Retry_Count': """
|
||||
Number of spin start retry attempts.
|
||||
Impact:
|
||||
- Indicates potential motor or bearing issues
|
||||
- May predict imminent mechanical failure
|
||||
- Increasing values suggest degrading drive health
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor for rapid increases
|
||||
2. Check drive temperature
|
||||
3. Plan replacement if count grows rapidly
|
||||
""",
|
||||
|
||||
'Power_On_Hours': """
|
||||
Total number of hours the device has been powered on.
|
||||
Impact:
|
||||
- Normal aging metric
|
||||
- Used to gauge overall drive lifetime
|
||||
- Compare against manufacturer's MTBF rating
|
||||
|
||||
Recommended Actions:
|
||||
1. Compare to warranty period
|
||||
2. Plan replacement if approaching rated lifetime
|
||||
""",
|
||||
|
||||
'Media_Wearout_Indicator': """
|
||||
Percentage of drive's rated life remaining (SSDs).
|
||||
Impact:
|
||||
- 100 indicates new drive
|
||||
- 0 indicates exceeded rated writes
|
||||
- Critical for SSD lifecycle management
|
||||
|
||||
Recommended Actions:
|
||||
1. Plan replacement below 20%
|
||||
2. Monitor write workload
|
||||
3. Consider workload redistribution
|
||||
""",
|
||||
|
||||
'Temperature_Celsius': """
|
||||
Current drive temperature.
|
||||
Impact:
|
||||
- High temperatures accelerate wear
|
||||
- Optimal range: 20-45°C
|
||||
- Sustained high temps reduce lifespan
|
||||
|
||||
Recommended Actions:
|
||||
1. Check system cooling
|
||||
2. Verify airflow
|
||||
3. Monitor for sustained high temperatures
|
||||
""",
|
||||
|
||||
'Available_Spare': """
|
||||
Percentage of spare blocks remaining (SSDs).
|
||||
Impact:
|
||||
- Critical for SSD endurance
|
||||
- Low values indicate approaching end-of-life
|
||||
- Rapid decreases suggest excessive writes
|
||||
|
||||
Recommended Actions:
|
||||
1. Plan replacement if below 20%
|
||||
2. Monitor write patterns
|
||||
3. Consider workload changes
|
||||
""",
|
||||
|
||||
'Program_Fail_Count': """
|
||||
Number of flash program operation failures.
|
||||
Impact:
|
||||
- Indicates NAND cell reliability
|
||||
- Important for SSD health assessment
|
||||
- Increasing values suggest flash degradation
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor rate of increase
|
||||
2. Check firmware updates
|
||||
3. Plan replacement if rapidly increasing
|
||||
""",
|
||||
|
||||
'Erase_Fail_Count': """
|
||||
Number of flash erase operation failures.
|
||||
Impact:
|
||||
- Related to NAND block health
|
||||
- Critical for SSD reliability
|
||||
- High counts suggest failing flash blocks
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor count increases
|
||||
2. Check firmware version
|
||||
3. Plan replacement if count is high
|
||||
""",
|
||||
|
||||
'Load_Cycle_Count': """
|
||||
Number of power cycles and head load/unload events.
|
||||
Impact:
|
||||
- Normal operation metric
|
||||
- High counts may indicate power management issues
|
||||
- Compare against rated cycles (typically 600k-1M)
|
||||
|
||||
Recommended Actions:
|
||||
1. Review power management settings
|
||||
2. Monitor rate of increase
|
||||
3. Plan replacement near rated limit
|
||||
""",
|
||||
|
||||
'Wear_Leveling_Count': """
|
||||
SSD block erase distribution metric.
|
||||
Impact:
|
||||
- Indicates wear pattern uniformity
|
||||
- Higher values show more balanced wear
|
||||
- Critical for SSD longevity
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor trend over time
|
||||
2. Compare with similar drives
|
||||
3. Check workload distribution
|
||||
"""
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
||||
@ -186,19 +362,52 @@ class SystemHealthMonitor:
|
||||
|
||||
return drive_details
|
||||
|
||||
STANDARD_WIDTH = 80
|
||||
|
||||
def make_box(title: str, content: str) -> str:
|
||||
return f"""
|
||||
┏━ {title} {'━' * (content_width - len(title) - 3)}┓
|
||||
{content}
|
||||
┗{'━' * content_width}┛"""
|
||||
|
||||
# Format each section using the consistent width
|
||||
sections = {
|
||||
'DRIVE SPECIFICATIONS': ...,
|
||||
'SMART STATUS': ...,
|
||||
'PARTITION INFO': ...
|
||||
}
|
||||
|
||||
# Each content line should pad to content_width
|
||||
for section, content in sections.items():
|
||||
formatted_content = '\n'.join(f"┃ {line:<{content_width-2}}┃" for line in content.split('\n'))
|
||||
description += make_box(section, formatted_content)
|
||||
|
||||
def _get_issue_type(self, issue: str) -> str:
|
||||
if "SMART" in issue:
|
||||
return "SMART Health Issue"
|
||||
elif "Drive" in issue:
|
||||
return "Storage Issue"
|
||||
elif "ECC" in issue:
|
||||
return "Memory Issue"
|
||||
elif "CPU" in issue:
|
||||
return "Performance Issue"
|
||||
elif "Network" in issue:
|
||||
return "Network Issue"
|
||||
return "Hardware Issue"
|
||||
|
||||
def _get_impact_level(self, issue: str) -> str:
|
||||
if "CRITICAL" in issue or "UNHEALTHY" in issue:
|
||||
return "🔴 Critical - Immediate Action Required"
|
||||
elif "WARNING" in issue:
|
||||
return "🟡 Warning - Action Needed Soon"
|
||||
return "🟢 Low - Monitor Only"
|
||||
|
||||
def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
|
||||
hostname = socket.gethostname()
|
||||
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM"
|
||||
|
||||
# Calculate maximum width based on content
|
||||
content_width = max(
|
||||
len(hostname),
|
||||
len(timestamp),
|
||||
len(priority),
|
||||
len("HARDWARE MONITORING ALERT TICKET")
|
||||
) + 10 # Add padding
|
||||
|
||||
content_width = STANDARD_WIDTH - 2
|
||||
banner = f"""
|
||||
┏{'━' * content_width}┓
|
||||
┃{' HARDWARE MONITORING ALERT TICKET '.center(content_width)}┃
|
||||
@ -206,208 +415,7 @@ class SystemHealthMonitor:
|
||||
┃ Host : {hostname:<{content_width-13}}┃
|
||||
┃ Generated : {timestamp:<{content_width-13}}┃
|
||||
┃ Priority : {priority:<{content_width-13}}┃
|
||||
┗{'━' * content_width}┛
|
||||
"""
|
||||
|
||||
description = banner + "\n" + "┏━ ISSUE SUMMARY " + "━" * 50 + "\n" + issue + "\n\n"
|
||||
|
||||
# Add SMART attribute explanations
|
||||
SMART_DESCRIPTIONS = {
|
||||
'Reported_Uncorrect': """
|
||||
Number of errors that could not be recovered using hardware ECC.
|
||||
Impact:
|
||||
- Indicates permanent data loss in affected sectors
|
||||
- High correlation with drive hardware failure
|
||||
- Critical reliability indicator
|
||||
|
||||
Recommended Actions:
|
||||
1. Backup critical data immediately
|
||||
2. Check drive logs for related errors
|
||||
3. Plan for drive replacement
|
||||
4. Monitor for error count increases
|
||||
""",
|
||||
|
||||
'Reallocated_Sector_Ct': """
|
||||
Number of sectors that have been reallocated due to errors.
|
||||
Impact:
|
||||
- High counts indicate degrading media
|
||||
- Each reallocation uses one of the drive's limited spare sectors
|
||||
- Rapid increases suggest accelerating drive wear
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor rate of increase
|
||||
2. Check drive temperature
|
||||
3. Plan replacement if count grows rapidly
|
||||
""",
|
||||
|
||||
'Current_Pending_Sector': """
|
||||
Sectors waiting to be reallocated due to read/write errors.
|
||||
Impact:
|
||||
- Indicates potentially unstable sectors
|
||||
- May result in data loss if unrecoverable
|
||||
- Should be monitored for increases
|
||||
|
||||
Recommended Actions:
|
||||
1. Backup affected files
|
||||
2. Run extended SMART tests
|
||||
3. Monitor for conversion to reallocated sectors
|
||||
""",
|
||||
|
||||
'Offline_Uncorrectable': """
|
||||
Count of uncorrectable errors detected during offline data collection.
|
||||
Impact:
|
||||
- Direct indicator of media reliability issues
|
||||
- May affect data integrity
|
||||
- High values suggest drive replacement needed
|
||||
|
||||
Recommended Actions:
|
||||
1. Run extended SMART tests
|
||||
2. Check drive logs
|
||||
3. Plan replacement if count is increasing
|
||||
""",
|
||||
|
||||
'Spin_Retry_Count': """
|
||||
Number of spin start retry attempts.
|
||||
Impact:
|
||||
- Indicates potential motor or bearing issues
|
||||
- May predict imminent mechanical failure
|
||||
- Increasing values suggest degrading drive health
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor for rapid increases
|
||||
2. Check drive temperature
|
||||
3. Plan replacement if count grows rapidly
|
||||
""",
|
||||
|
||||
'Power_On_Hours': """
|
||||
Total number of hours the device has been powered on.
|
||||
Impact:
|
||||
- Normal aging metric
|
||||
- Used to gauge overall drive lifetime
|
||||
- Compare against manufacturer's MTBF rating
|
||||
|
||||
Recommended Actions:
|
||||
1. Compare to warranty period
|
||||
2. Plan replacement if approaching rated lifetime
|
||||
""",
|
||||
|
||||
'Media_Wearout_Indicator': """
|
||||
Percentage of drive's rated life remaining (SSDs).
|
||||
Impact:
|
||||
- 100 indicates new drive
|
||||
- 0 indicates exceeded rated writes
|
||||
- Critical for SSD lifecycle management
|
||||
|
||||
Recommended Actions:
|
||||
1. Plan replacement below 20%
|
||||
2. Monitor write workload
|
||||
3. Consider workload redistribution
|
||||
""",
|
||||
|
||||
'Temperature_Celsius': """
|
||||
Current drive temperature.
|
||||
Impact:
|
||||
- High temperatures accelerate wear
|
||||
- Optimal range: 20-45°C
|
||||
- Sustained high temps reduce lifespan
|
||||
|
||||
Recommended Actions:
|
||||
1. Check system cooling
|
||||
2. Verify airflow
|
||||
3. Monitor for sustained high temperatures
|
||||
""",
|
||||
|
||||
'Available_Spare': """
|
||||
Percentage of spare blocks remaining (SSDs).
|
||||
Impact:
|
||||
- Critical for SSD endurance
|
||||
- Low values indicate approaching end-of-life
|
||||
- Rapid decreases suggest excessive writes
|
||||
|
||||
Recommended Actions:
|
||||
1. Plan replacement if below 20%
|
||||
2. Monitor write patterns
|
||||
3. Consider workload changes
|
||||
""",
|
||||
|
||||
'Program_Fail_Count': """
|
||||
Number of flash program operation failures.
|
||||
Impact:
|
||||
- Indicates NAND cell reliability
|
||||
- Important for SSD health assessment
|
||||
- Increasing values suggest flash degradation
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor rate of increase
|
||||
2. Check firmware updates
|
||||
3. Plan replacement if rapidly increasing
|
||||
""",
|
||||
|
||||
'Erase_Fail_Count': """
|
||||
Number of flash erase operation failures.
|
||||
Impact:
|
||||
- Related to NAND block health
|
||||
- Critical for SSD reliability
|
||||
- High counts suggest failing flash blocks
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor count increases
|
||||
2. Check firmware version
|
||||
3. Plan replacement if count is high
|
||||
""",
|
||||
|
||||
'Load_Cycle_Count': """
|
||||
Number of power cycles and head load/unload events.
|
||||
Impact:
|
||||
- Normal operation metric
|
||||
- High counts may indicate power management issues
|
||||
- Compare against rated cycles (typically 600k-1M)
|
||||
|
||||
Recommended Actions:
|
||||
1. Review power management settings
|
||||
2. Monitor rate of increase
|
||||
3. Plan replacement near rated limit
|
||||
""",
|
||||
|
||||
'Wear_Leveling_Count': """
|
||||
SSD block erase distribution metric.
|
||||
Impact:
|
||||
- Indicates wear pattern uniformity
|
||||
- Higher values show more balanced wear
|
||||
- Critical for SSD longevity
|
||||
|
||||
Recommended Actions:
|
||||
1. Monitor trend over time
|
||||
2. Compare with similar drives
|
||||
3. Check workload distribution
|
||||
"""
|
||||
}
|
||||
SEVERITY_INDICATORS = {
|
||||
'CRITICAL': '🔴',
|
||||
'WARNING': '🟡',
|
||||
'HEALTHY': '🟢',
|
||||
'UNKNOWN': '⚪'
|
||||
}
|
||||
|
||||
def _get_issue_type(self, issue: str) -> str:
|
||||
if "SMART" in issue:
|
||||
return "SMART Health Issue"
|
||||
elif "Drive" in issue:
|
||||
return "Storage Issue"
|
||||
elif "ECC" in issue:
|
||||
return "Memory Issue"
|
||||
elif "CPU" in issue:
|
||||
return "Performance Issue"
|
||||
elif "Network" in issue:
|
||||
return "Network Issue"
|
||||
return "Hardware Issue"
|
||||
|
||||
def _get_impact_level(self, issue: str) -> str:
|
||||
if "CRITICAL" in issue or "UNHEALTHY" in issue:
|
||||
return "🔴 Critical - Immediate Action Required"
|
||||
elif "WARNING" in issue:
|
||||
return "🟡 Warning - Action Needed Soon"
|
||||
return "🟢 Low - Monitor Only"
|
||||
┗{'━' * content_width}┛"""
|
||||
|
||||
executive_summary = f"""
|
||||
┏━ EXECUTIVE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
||||
|
||||
Reference in New Issue
Block a user