From 35a16a1793fe15d6b48a6ef2dc6a7c1df84ec93a Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Tue, 6 Jan 2026 15:08:46 -0500 Subject: [PATCH] Fix reallocated sector scoring - drives with bad sectors now rank correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Problem**: osd.28 with 16 reallocated sectors only ranked #7 with score 40.8 This is a CRITICAL failing drive that should rank just below failed SMART reads. **Changes**: - Reallocated sectors now use tiered penalties: * 10+ sectors: -95 points (health = 5/100) - DRIVE FAILING * 5-9 sectors: -85 points (health = 15/100) - CRITICAL * 1-4 sectors: -70 points (health = 30/100) - SERIOUS - Added critical_issues detection for sector problems - Critical issues get +20 bonus (large) or +25 (small) in scoring - Updated issue text to "DRIVE FAILING" for clarity **Expected Result**: - osd.28 will now score ~96/100 and rank #7 (right after 6 failed SMART) - Any drive with reallocated/pending/uncorrectable sectors gets top priority - Matches priority: Failed SMART > Critical sectors > Small failing > Rest 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- ceph_osd_analyzer.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py index a3e93a2..4790200 100644 --- a/ceph_osd_analyzer.py +++ b/ceph_osd_analyzer.py @@ -235,12 +235,18 @@ def parse_smart_health(smart_data): value = attr.get('value', 0) raw_value = attr.get('raw', {}).get('value', 0) - # Reallocated Sectors (5) - CRITICAL indicator + # Reallocated Sectors (5) - CRITICAL indicator of imminent failure if attr_id == 5: metrics['reallocated_sectors'] = raw_value if raw_value > 0: - score -= min(50, raw_value * 5) # Much more aggressive - issues.append(f"CRITICAL: Reallocated sectors: {raw_value}") + # ANY reallocated sectors is a severe problem + if raw_value >= 10: + score -= 95 # Drive is failing, near-zero health + elif raw_value >= 5: + score -= 85 # Critical failure imminent + else: + score -= 70 # Even 1-4 sectors is very serious + issues.append(f"CRITICAL: Reallocated sectors: {raw_value} - DRIVE FAILING") # Spin Retry Count (10) - CRITICAL elif attr_id == 10: @@ -480,6 +486,8 @@ def analyze_cluster(): # Calculate total score with revised weights # Priority: Failed drives > Small failing drives > Small drives > Any failing has_health_issues = len(health_issues) > 0 + has_critical_issues = any('CRITICAL:' in issue and ('Reallocated' in issue or 'Uncorrectable' in issue or 'Pending' in issue) + for issue in health_issues) is_small = osd_df_data.get('crush_weight', 0) < 5 # Base scoring: 80% health, 15% capacity, 5% resilience @@ -495,6 +503,11 @@ def analyze_cluster(): base_score += 30 # Failed SMART + small = top priority else: base_score += 20 # Failed SMART alone is still critical + elif has_critical_issues: # Reallocated/pending/uncorrectable sectors + if is_small: + base_score += 25 # Critical issues + small drive + else: + base_score += 20 # Critical issues alone elif has_health_issues and is_small: base_score += 15 # Small + beginning to fail