diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py
index a3e93a2..4790200 100644
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -235,12 +235,18 @@ def parse_smart_health(smart_data):
         value = attr.get('value', 0)
         raw_value = attr.get('raw', {}).get('value', 0)

-        # Reallocated Sectors (5) - CRITICAL indicator
+        # Reallocated Sectors (5) - CRITICAL indicator of imminent failure
        if attr_id == 5:
             metrics['reallocated_sectors'] = raw_value
             if raw_value > 0:
-                score -= min(50, raw_value * 5)  # Much more aggressive
-                issues.append(f"CRITICAL: Reallocated sectors: {raw_value}")
+                # ANY reallocated sectors is a severe problem
+                if raw_value >= 10:
+                    score -= 95  # Drive is failing, near-zero health
+                elif raw_value >= 5:
+                    score -= 85  # Critical failure imminent
+                else:
+                    score -= 70  # Even 1-4 sectors is very serious
+                issues.append(f"CRITICAL: Reallocated sectors: {raw_value} - DRIVE FAILING")

         # Spin Retry Count (10) - CRITICAL
         elif attr_id == 10:
@@ -480,6 +486,8 @@ def analyze_cluster():
         # Calculate total score with revised weights
         # Priority: Failed drives > Small failing drives > Small drives > Any failing
         has_health_issues = len(health_issues) > 0
+        has_critical_issues = any('CRITICAL:' in issue and ('Reallocated' in issue or 'Uncorrectable' in issue or 'Pending' in issue)
+                                  for issue in health_issues)
         is_small = osd_df_data.get('crush_weight', 0) < 5

         # Base scoring: 80% health, 15% capacity, 5% resilience
@@ -495,6 +503,11 @@
                 base_score += 30  # Failed SMART + small = top priority
             else:
                 base_score += 20  # Failed SMART alone is still critical
+        elif has_critical_issues:  # Reallocated/pending/uncorrectable sectors
+            if is_small:
+                base_score += 25  # Critical issues + small drive
+            else:
+                base_score += 20  # Critical issues alone
         elif has_health_issues and is_small:
             base_score += 15  # Small + beginning to fail
