Fix P1 escalation false positives and Ceph title spacing
- Exclude manufacturer operation counters (Seek_Error_Rate, Command_Timeout, Raw_Read_Error_Rate) from the critical issue count to prevent false P1 escalation
- Fix missing space after the [ceph] tag in ticket titles

Before: [hostname][auto][ceph]Ceph HEALTH_WARN
After:  [hostname][auto][ceph] Ceph HEALTH_WARN

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1311,11 +1311,20 @@ class SystemHealthMonitor:
|
||||
"""Count total critical issues across all health checks for P1 escalation."""
|
||||
count = 0
|
||||
|
||||
# Manufacturer operation counters to exclude (same as in _detect_issues)
|
||||
manufacturer_counters = [
|
||||
'Seek_Error_Rate', 'Command_Timeout', 'Raw_Read_Error_Rate'
|
||||
]
|
||||
|
||||
# Count drive failures
|
||||
for drive in health_report.get('drives_health', {}).get('drives', []):
|
||||
if drive.get('smart_status') == 'UNHEALTHY':
|
||||
count += 1
|
||||
if any('critical' in issue.lower() for issue in drive.get('smart_issues', [])):
|
||||
# Only count critical issues that aren't manufacturer operation counters
|
||||
for issue in drive.get('smart_issues', []):
|
||||
if 'critical' in issue.lower():
|
||||
# Skip manufacturer operation counters
|
||||
if not any(counter in issue for counter in manufacturer_counters):
|
||||
count += 1
|
||||
|
||||
# Count ECC errors
|
||||
@@ -1571,11 +1580,13 @@ class SystemHealthMonitor:
|
||||
logger.warning(f"Could not extract device from issue: {issue}")
|
||||
|
||||
# Build ticket title with proper categorization
|
||||
# Add space after issue_tag if drive_size is empty (for non-drive issues)
|
||||
issue_separator = drive_size if drive_size else " "
|
||||
ticket_title = (
|
||||
f"[{hostname}]"
|
||||
f"{action_type['AUTO']}"
|
||||
f"{issue_tag}"
|
||||
f"{drive_size}"
|
||||
f"{issue_separator}"
|
||||
f"{clean_issue}"
|
||||
f"{scope}"
|
||||
f"{environment['PRODUCTION']}"
|
||||
|
||||
Reference in New Issue
Block a user