Updated function scoping
hwmonDaemon.py
@@ -76,6 +76,182 @@ class SystemHealthMonitor:
             'WD141KRYZ': ['02.01A02']
         }
     }
+    SEVERITY_INDICATORS = {
+        'CRITICAL': '🔴',
+        'WARNING': '🟡',
+        'HEALTHY': '🟢',
+        'UNKNOWN': '⚪'
+    }
+    SMART_DESCRIPTIONS = {
+        'Reported_Uncorrect': """
+Number of errors that could not be recovered using hardware ECC.
+Impact:
+- Indicates permanent data loss in affected sectors
+- High correlation with drive hardware failure
+- Critical reliability indicator
+
+Recommended Actions:
+1. Backup critical data immediately
+2. Check drive logs for related errors
+3. Plan for drive replacement
+4. Monitor for error count increases
+""",
+
+        'Reallocated_Sector_Ct': """
+Number of sectors that have been reallocated due to errors.
+Impact:
+- High counts indicate degrading media
+- Each reallocation uses one of the drive's limited spare sectors
+- Rapid increases suggest accelerating drive wear
+
+Recommended Actions:
+1. Monitor rate of increase
+2. Check drive temperature
+3. Plan replacement if count grows rapidly
+""",
+
+        'Current_Pending_Sector': """
+Sectors waiting to be reallocated due to read/write errors.
+Impact:
+- Indicates potentially unstable sectors
+- May result in data loss if unrecoverable
+- Should be monitored for increases
+
+Recommended Actions:
+1. Backup affected files
+2. Run extended SMART tests
+3. Monitor for conversion to reallocated sectors
+""",
+
+        'Offline_Uncorrectable': """
+Count of uncorrectable errors detected during offline data collection.
+Impact:
+- Direct indicator of media reliability issues
+- May affect data integrity
+- High values suggest drive replacement needed
+
+Recommended Actions:
+1. Run extended SMART tests
+2. Check drive logs
+3. Plan replacement if count is increasing
+""",
+
+        'Spin_Retry_Count': """
+Number of spin start retry attempts.
+Impact:
+- Indicates potential motor or bearing issues
+- May predict imminent mechanical failure
+- Increasing values suggest degrading drive health
+
+Recommended Actions:
+1. Monitor for rapid increases
+2. Check drive temperature
+3. Plan replacement if count grows rapidly
+""",
+
+        'Power_On_Hours': """
+Total number of hours the device has been powered on.
+Impact:
+- Normal aging metric
+- Used to gauge overall drive lifetime
+- Compare against manufacturer's MTBF rating
+
+Recommended Actions:
+1. Compare to warranty period
+2. Plan replacement if approaching rated lifetime
+""",
+
+        'Media_Wearout_Indicator': """
+Percentage of drive's rated life remaining (SSDs).
+Impact:
+- 100 indicates new drive
+- 0 indicates exceeded rated writes
+- Critical for SSD lifecycle management
+
+Recommended Actions:
+1. Plan replacement below 20%
+2. Monitor write workload
+3. Consider workload redistribution
+""",
+
+        'Temperature_Celsius': """
+Current drive temperature.
+Impact:
+- High temperatures accelerate wear
+- Optimal range: 20-45°C
+- Sustained high temps reduce lifespan
+
+Recommended Actions:
+1. Check system cooling
+2. Verify airflow
+3. Monitor for sustained high temperatures
+""",
+
+        'Available_Spare': """
+Percentage of spare blocks remaining (SSDs).
+Impact:
+- Critical for SSD endurance
+- Low values indicate approaching end-of-life
+- Rapid decreases suggest excessive writes
+
+Recommended Actions:
+1. Plan replacement if below 20%
+2. Monitor write patterns
+3. Consider workload changes
+""",
+
+        'Program_Fail_Count': """
+Number of flash program operation failures.
+Impact:
+- Indicates NAND cell reliability
+- Important for SSD health assessment
+- Increasing values suggest flash degradation
+
+Recommended Actions:
+1. Monitor rate of increase
+2. Check firmware updates
+3. Plan replacement if rapidly increasing
+""",
+
+        'Erase_Fail_Count': """
+Number of flash erase operation failures.
+Impact:
+- Related to NAND block health
+- Critical for SSD reliability
+- High counts suggest failing flash blocks
+
+Recommended Actions:
+1. Monitor count increases
+2. Check firmware version
+3. Plan replacement if count is high
+""",
+
+        'Load_Cycle_Count': """
+Number of power cycles and head load/unload events.
+Impact:
+- Normal operation metric
+- High counts may indicate power management issues
+- Compare against rated cycles (typically 600k-1M)
+
+Recommended Actions:
+1. Review power management settings
+2. Monitor rate of increase
+3. Plan replacement near rated limit
+""",
+
+        'Wear_Leveling_Count': """
+SSD block erase distribution metric.
+Impact:
+- Indicates wear pattern uniformity
+- Higher values show more balanced wear
+- Critical for SSD longevity
+
+Recommended Actions:
+1. Monitor trend over time
+2. Compare with similar drives
+3. Check workload distribution
+"""
+    }
 
     def __init__(self,
                  ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
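The tables above are plain class attributes, so a caller can read them without building any per-call state. The following is an illustrative sketch only, not part of this commit: the class name AttributeDescriber, the describe() helper, and the trimmed-down dicts are hypothetical stand-ins showing how the SEVERITY_INDICATORS and SMART_DESCRIPTIONS tables might be consumed when composing ticket text.

# Illustrative sketch -- not from hwmonDaemon.py.
class AttributeDescriber:
    # Reduced copies of the class-level tables added in the hunk above.
    SEVERITY_INDICATORS = {'CRITICAL': '🔴', 'WARNING': '🟡', 'HEALTHY': '🟢', 'UNKNOWN': '⚪'}
    SMART_DESCRIPTIONS = {
        'Reported_Uncorrect': 'Number of errors that could not be recovered using hardware ECC.',
    }

    def describe(self, attribute: str, severity: str = 'UNKNOWN') -> str:
        # Fall back to the UNKNOWN marker / a generic note for attributes not in the tables.
        icon = self.SEVERITY_INDICATORS.get(severity, self.SEVERITY_INDICATORS['UNKNOWN'])
        text = self.SMART_DESCRIPTIONS.get(attribute, 'No description available.')
        return f"{icon} {attribute}: {text}"

print(AttributeDescriber().describe('Reported_Uncorrect', 'CRITICAL'))
# -> 🔴 Reported_Uncorrect: Number of errors that could not be recovered using hardware ECC.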
@@ -186,19 +362,52 @@ class SystemHealthMonitor:
 
         return drive_details
 
+    STANDARD_WIDTH = 80
+
+    def make_box(title: str, content: str) -> str:
+        return f"""
+┏━ {title} {'━' * (content_width - len(title) - 3)}┓
+{content}
+┗{'━' * content_width}┛"""
+
+    # Format each section using the consistent width
+    sections = {
+        'DRIVE SPECIFICATIONS': ...,
+        'SMART STATUS': ...,
+        'PARTITION INFO': ...
+    }
+
+    # Each content line should pad to content_width
+    for section, content in sections.items():
+        formatted_content = '\n'.join(f"┃ {line:<{content_width-2}}┃" for line in content.split('\n'))
+        description += make_box(section, formatted_content)
+
+    def _get_issue_type(self, issue: str) -> str:
+        if "SMART" in issue:
+            return "SMART Health Issue"
+        elif "Drive" in issue:
+            return "Storage Issue"
+        elif "ECC" in issue:
+            return "Memory Issue"
+        elif "CPU" in issue:
+            return "Performance Issue"
+        elif "Network" in issue:
+            return "Network Issue"
+        return "Hardware Issue"
+
+    def _get_impact_level(self, issue: str) -> str:
+        if "CRITICAL" in issue or "UNHEALTHY" in issue:
+            return "🔴 Critical - Immediate Action Required"
+        elif "WARNING" in issue:
+            return "🟡 Warning - Action Needed Soon"
+        return "🟢 Low - Monitor Only"
+
     def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
         hostname = socket.gethostname()
         timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM"
 
-        # Calculate maximum width based on content
-        content_width = max(
-            len(hostname),
-            len(timestamp),
-            len(priority),
-            len("HARDWARE MONITORING ALERT TICKET")
-        ) + 10  # Add padding
-
+        content_width = STANDARD_WIDTH - 2
         banner = f"""
┏{'━' * content_width}┓
┃{' HARDWARE MONITORING ALERT TICKET '.center(content_width)}┃
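The hunk above drops the per-ticket max()-based width calculation in favour of a fixed STANDARD_WIDTH. Below is an illustrative, self-contained sketch of that fixed-width box style; it is not the committed code. In particular, this version passes content_width as a parameter, whereas the make_box added in the diff reads content_width from an enclosing scope.

# Illustrative sketch -- not from hwmonDaemon.py.
STANDARD_WIDTH = 80

def make_box(title: str, content: str, content_width: int = STANDARD_WIDTH - 2) -> str:
    # Pad every content line so the right-hand border lines up.
    body = '\n'.join(f"┃ {line:<{content_width - 2}}┃" for line in content.split('\n'))
    top = f"┏━ {title} " + '━' * (content_width - len(title) - 3) + '┓'
    bottom = '┗' + '━' * content_width + '┛'
    return '\n'.join((top, body, bottom))

print(make_box('SMART STATUS', 'Reallocated_Sector_Ct : 0\nTemperature_Celsius   : 38'))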
@@ -206,208 +415,7 @@ class SystemHealthMonitor:
 ┃ Host : {hostname:<{content_width-13}}┃
 ┃ Generated : {timestamp:<{content_width-13}}┃
 ┃ Priority : {priority:<{content_width-13}}┃
-┗{'━' * content_width}┛
+┗{'━' * content_width}┛"""
-"""
-
-        description = banner + "\n" + "┏━ ISSUE SUMMARY " + "━" * 50 + "\n" + issue + "\n\n"
-
-        # Add SMART attribute explanations
-        SMART_DESCRIPTIONS = {
-            'Reported_Uncorrect': """
-Number of errors that could not be recovered using hardware ECC.
-Impact:
-- Indicates permanent data loss in affected sectors
-- High correlation with drive hardware failure
-- Critical reliability indicator
-
-Recommended Actions:
-1. Backup critical data immediately
-2. Check drive logs for related errors
-3. Plan for drive replacement
-4. Monitor for error count increases
-""",
-
-            'Reallocated_Sector_Ct': """
-Number of sectors that have been reallocated due to errors.
-Impact:
-- High counts indicate degrading media
-- Each reallocation uses one of the drive's limited spare sectors
-- Rapid increases suggest accelerating drive wear
-
-Recommended Actions:
-1. Monitor rate of increase
-2. Check drive temperature
-3. Plan replacement if count grows rapidly
-""",
-
-            'Current_Pending_Sector': """
-Sectors waiting to be reallocated due to read/write errors.
-Impact:
-- Indicates potentially unstable sectors
-- May result in data loss if unrecoverable
-- Should be monitored for increases
-
-Recommended Actions:
-1. Backup affected files
-2. Run extended SMART tests
-3. Monitor for conversion to reallocated sectors
-""",
-
-            'Offline_Uncorrectable': """
-Count of uncorrectable errors detected during offline data collection.
-Impact:
-- Direct indicator of media reliability issues
-- May affect data integrity
-- High values suggest drive replacement needed
-
-Recommended Actions:
-1. Run extended SMART tests
-2. Check drive logs
-3. Plan replacement if count is increasing
-""",
-
-            'Spin_Retry_Count': """
-Number of spin start retry attempts.
-Impact:
-- Indicates potential motor or bearing issues
-- May predict imminent mechanical failure
-- Increasing values suggest degrading drive health
-
-Recommended Actions:
-1. Monitor for rapid increases
-2. Check drive temperature
-3. Plan replacement if count grows rapidly
-""",
-
-            'Power_On_Hours': """
-Total number of hours the device has been powered on.
-Impact:
-- Normal aging metric
-- Used to gauge overall drive lifetime
-- Compare against manufacturer's MTBF rating
-
-Recommended Actions:
-1. Compare to warranty period
-2. Plan replacement if approaching rated lifetime
-""",
-
-            'Media_Wearout_Indicator': """
-Percentage of drive's rated life remaining (SSDs).
-Impact:
-- 100 indicates new drive
-- 0 indicates exceeded rated writes
-- Critical for SSD lifecycle management
-
-Recommended Actions:
-1. Plan replacement below 20%
-2. Monitor write workload
-3. Consider workload redistribution
-""",
-
-            'Temperature_Celsius': """
-Current drive temperature.
-Impact:
-- High temperatures accelerate wear
-- Optimal range: 20-45°C
-- Sustained high temps reduce lifespan
-
-Recommended Actions:
-1. Check system cooling
-2. Verify airflow
-3. Monitor for sustained high temperatures
-""",
-
-            'Available_Spare': """
-Percentage of spare blocks remaining (SSDs).
-Impact:
-- Critical for SSD endurance
-- Low values indicate approaching end-of-life
-- Rapid decreases suggest excessive writes
-
-Recommended Actions:
-1. Plan replacement if below 20%
-2. Monitor write patterns
-3. Consider workload changes
-""",
-
-            'Program_Fail_Count': """
-Number of flash program operation failures.
-Impact:
-- Indicates NAND cell reliability
-- Important for SSD health assessment
-- Increasing values suggest flash degradation
-
-Recommended Actions:
-1. Monitor rate of increase
-2. Check firmware updates
-3. Plan replacement if rapidly increasing
-""",
-
-            'Erase_Fail_Count': """
-Number of flash erase operation failures.
-Impact:
-- Related to NAND block health
-- Critical for SSD reliability
-- High counts suggest failing flash blocks
-
-Recommended Actions:
-1. Monitor count increases
-2. Check firmware version
-3. Plan replacement if count is high
-""",
-
-            'Load_Cycle_Count': """
-Number of power cycles and head load/unload events.
-Impact:
-- Normal operation metric
-- High counts may indicate power management issues
-- Compare against rated cycles (typically 600k-1M)
-
-Recommended Actions:
-1. Review power management settings
-2. Monitor rate of increase
-3. Plan replacement near rated limit
-""",
-
-            'Wear_Leveling_Count': """
-SSD block erase distribution metric.
-Impact:
-- Indicates wear pattern uniformity
-- Higher values show more balanced wear
-- Critical for SSD longevity
-
-Recommended Actions:
-1. Monitor trend over time
-2. Compare with similar drives
-3. Check workload distribution
-"""
-        }
-        SEVERITY_INDICATORS = {
-            'CRITICAL': '🔴',
-            'WARNING': '🟡',
-            'HEALTHY': '🟢',
-            'UNKNOWN': '⚪'
-        }
-
-        def _get_issue_type(self, issue: str) -> str:
-            if "SMART" in issue:
-                return "SMART Health Issue"
-            elif "Drive" in issue:
-                return "Storage Issue"
-            elif "ECC" in issue:
-                return "Memory Issue"
-            elif "CPU" in issue:
-                return "Performance Issue"
-            elif "Network" in issue:
-                return "Network Issue"
-            return "Hardware Issue"
-
-        def _get_impact_level(self, issue: str) -> str:
-            if "CRITICAL" in issue or "UNHEALTHY" in issue:
-                return "🔴 Critical - Immediate Action Required"
-            elif "WARNING" in issue:
-                return "🟡 Warning - Action Needed Soon"
-            return "🟢 Low - Monitor Only"
 
         executive_summary = f"""
 ┏━ EXECUTIVE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
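The commit title refers to where these tables and helpers live: previously the dict literals were rebuilt inside _generate_detailed_description on every call, while as class attributes and methods they are created once when the class is defined. A small, illustrative comparison of the two scopings follows; the class names here are hypothetical and not taken from the daemon.

# Illustrative sketch -- contrasts function-scoped vs class-scoped lookup tables.
class FunctionScoped:
    def indicator(self, level: str) -> str:
        # Dict literal is rebuilt on every call.
        severity_indicators = {'CRITICAL': '🔴', 'WARNING': '🟡', 'HEALTHY': '🟢', 'UNKNOWN': '⚪'}
        return severity_indicators.get(level, '⚪')

class ClassScoped:
    # Built once at class-definition time; shared by all instances and callers.
    SEVERITY_INDICATORS = {'CRITICAL': '🔴', 'WARNING': '🟡', 'HEALTHY': '🟢', 'UNKNOWN': '⚪'}

    def indicator(self, level: str) -> str:
        return self.SEVERITY_INDICATORS.get(level, '⚪')

# Both scopings return the same result; only construction cost and visibility differ.
assert FunctionScoped().indicator('WARNING') == ClassScoped().indicator('WARNING') == '🟡'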