Implement Phase C: Production Hardening

Implements phase-c-hardening.md to fix 6 RED resilience tests:

- D1/D2: Health check timeout (1.5s) + parallel execution via join_all
- C6: Make CAS calculate_size() recursive so it scans shard subdirectories
- C7: FUSE reads time out after 30s and return EIO instead of hanging
- 6.4: Auto-re-fetch corrupt/missing chunks from origin
- 6.6: Passthrough mode - continue even when CAS write fails
- C9: PID file with flock prevents concurrent mounts
- 5.3: fd exhaustion handling test

All 27 resilience tests now pass. Full test suite green.

Files changed:
- musicfs-origins/src/health.rs: timeout + join_all
- musicfs-origins/Cargo.toml: add futures dependency
- musicfs-cas/src/store.rs: recursive calculate_size
- musicfs-cas/src/reader.rs: auto-re-fetch on IntegrityError/NotFound
- musicfs-cas/src/fetcher.rs: passthrough fallback
- musicfs-fuse/src/filesystem.rs: 30s read timeout
- musicfs-cli/src/main.rs: PID file with flock
- musicfs-test-utils/tests/resilience.rs: updated tests
This commit is contained in:
Alexander
2026-05-13 15:55:22 +02:00
parent 3038c94b8c
commit 0ff2a17ab7
11 changed files with 325 additions and 39 deletions
@@ -12,6 +12,7 @@ sftp = []
musicfs-core = { path = "../musicfs-core" }
async-trait.workspace = true
dashmap.workspace = true
futures.workspace = true
libc.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["fs", "sync", "time"] }
+22 -5
View File
@@ -1,11 +1,12 @@
use crate::traits::Origin;
use dashmap::DashMap;
use futures::future::join_all;
use musicfs_core::{Event, EventBus, HealthStatus, OriginId, OriginType};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::mpsc;
use tracing::{debug, info, info_span, Instrument};
use tracing::{debug, info, info_span, warn, Instrument};
pub struct HealthMonitor {
origins: DashMap<OriginId, Arc<dyn Origin>>,
@@ -187,14 +188,30 @@ impl HealthMonitor {
.map(|e| (e.key().clone(), e.value().clone()))
.collect();
for (id, origin) in origins {
self.check_one(&id, &origin).await;
}
let checks: Vec<_> = origins
.iter()
.map(|(id, origin)| self.check_one(id, origin))
.collect();
join_all(checks).await;
}
async fn check_one(&self, id: &OriginId, origin: &Arc<dyn Origin>) {
let start = Instant::now();
let status = origin.health().await;
let health_timeout = Duration::from_millis(1500);
let status = match tokio::time::timeout(health_timeout, origin.health()).await {
Ok(status) => status,
Err(_) => {
warn!(
origin_id = %id,
timeout_ms = health_timeout.as_millis() as u64,
"Health check timed out"
);
HealthStatus::Unhealthy
}
};
let latency_ms = start.elapsed().as_millis() as u64;
let threshold = self.threshold_for(origin.origin_type());