Fix P1 escalation false positives and Ceph title spacing

- Exclude manufacturer operation counters (Seek_Error_Rate,
  Command_Timeout, Raw_Read_Error_Rate) from critical issue
  count to prevent false P1 escalation. Remaining critical
  SMART issues are now counted individually rather than at
  most once per drive.

- Fix missing space after the issue tag (e.g. [ceph]) in ticket
  titles when there is no drive size (non-drive issues)
  Before: [hostname][auto][ceph]Ceph HEALTH_WARN
  After:  [hostname][auto][ceph] Ceph HEALTH_WARN

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-17 15:59:58 -05:00
parent 6d959eff02
commit 1e84144e29

View File

@@ -1311,12 +1311,21 @@ class SystemHealthMonitor:
"""Count total critical issues across all health checks for P1 escalation."""
count = 0
# Manufacturer operation counters to exclude (same as in _detect_issues)
manufacturer_counters = [
'Seek_Error_Rate', 'Command_Timeout', 'Raw_Read_Error_Rate'
]
# Count drive failures
for drive in health_report.get('drives_health', {}).get('drives', []):
if drive.get('smart_status') == 'UNHEALTHY':
count += 1
if any('critical' in issue.lower() for issue in drive.get('smart_issues', [])):
count += 1
# Only count critical issues that aren't manufacturer operation counters
for issue in drive.get('smart_issues', []):
if 'critical' in issue.lower():
# Skip manufacturer operation counters
if not any(counter in issue for counter in manufacturer_counters):
count += 1
# Count ECC errors
if health_report.get('memory_health', {}).get('status') == 'CRITICAL':
@@ -1571,11 +1580,13 @@ class SystemHealthMonitor:
logger.warning(f"Could not extract device from issue: {issue}")
# Build ticket title with proper categorization
# Add space after issue_tag if drive_size is empty (for non-drive issues)
issue_separator = drive_size if drive_size else " "
ticket_title = (
f"[{hostname}]"
f"{action_type['AUTO']}"
f"{issue_tag}"
f"{drive_size}"
f"{issue_separator}"
f"{clean_issue}"
f"{scope}"
f"{environment['PRODUCTION']}"