From 35a16a1793fe15d6b48a6ef2dc6a7c1df84ec93a Mon Sep 17 00:00:00 2001
From: Jared Vititoe <jjvititoe1@gmail.com>
Date: Tue, 6 Jan 2026 15:08:46 -0500
Subject: [PATCH] Fix reallocated sector scoring - drives with bad sectors now
 rank correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Problem**: osd.28, with 16 reallocated sectors, only ranked #7 with a score of 40.8.
This is a critically failing drive that should rank just below drives with failed SMART reads.

**Changes**:
- Reallocated sectors now use tiered penalties:
  * 10+ sectors: -95 points (health = 5/100) - DRIVE FAILING
  * 5-9 sectors: -85 points (health = 15/100) - CRITICAL
  * 1-4 sectors: -70 points (health = 30/100) - SERIOUS
- Added has_critical_issues detection for "CRITICAL:" reallocated/pending/uncorrectable sector issues
- Critical issues get a +20 bonus (large drives) or +25 (small drives, crush_weight < 5) in scoring
- Updated the reallocated-sector issue text to end with "- DRIVE FAILING" (all tiers) for clarity

**Expected Result**:
- osd.28 will now score ~96/100 and rank #7 (right after the 6 drives with failed SMART reads)
- Any drive with a CRITICAL reallocated/pending/uncorrectable sector issue is prioritized just below failed SMART drives
- Matches priority: Failed SMART > Critical sectors > Small failing > Rest

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 ceph_osd_analyzer.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py
index a3e93a2..4790200 100644
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -235,12 +235,18 @@ def parse_smart_health(smart_data):
             value = attr.get('value', 0)
             raw_value = attr.get('raw', {}).get('value', 0)
             
-            # Reallocated Sectors (5) - CRITICAL indicator
+            # Reallocated Sectors (5) - CRITICAL indicator of imminent failure
             if attr_id == 5:
                 metrics['reallocated_sectors'] = raw_value
                 if raw_value > 0:
-                    score -= min(50, raw_value * 5)  # Much more aggressive
-                    issues.append(f"CRITICAL: Reallocated sectors: {raw_value}")
+                    # ANY reallocated sectors is a severe problem
+                    if raw_value >= 10:
+                        score -= 95  # Drive is failing, near-zero health
+                    elif raw_value >= 5:
+                        score -= 85  # Critical failure imminent
+                    else:
+                        score -= 70  # Even 1-4 sectors is very serious
+                    issues.append(f"CRITICAL: Reallocated sectors: {raw_value} - DRIVE FAILING")
             
             # Spin Retry Count (10) - CRITICAL
             elif attr_id == 10:
@@ -480,6 +486,8 @@ def analyze_cluster():
         # Calculate total score with revised weights
         # Priority: Failed drives > Small failing drives > Small drives > Any failing
         has_health_issues = len(health_issues) > 0
+        has_critical_issues = any('CRITICAL:' in issue and ('Reallocated' in issue or 'Uncorrectable' in issue or 'Pending' in issue)
+                                  for issue in health_issues)
         is_small = osd_df_data.get('crush_weight', 0) < 5
 
         # Base scoring: 80% health, 15% capacity, 5% resilience
@@ -495,6 +503,11 @@ def analyze_cluster():
                 base_score += 30  # Failed SMART + small = top priority
             else:
                 base_score += 20  # Failed SMART alone is still critical
+        elif has_critical_issues:  # Reallocated/pending/uncorrectable sectors
+            if is_small:
+                base_score += 25  # Critical issues + small drive
+            else:
+                base_score += 20  # Critical issues alone
         elif has_health_issues and is_small:
             base_score += 15  # Small + beginning to fail