Implement Phase C: Production Hardening
Implements phase-c-hardening.md to fix 6 RED resilience tests:

- D1/D2: Health check timeout (1.5s) + parallel execution via join_all
- C6: Recursive CAS calculate_size() to scan shard subdirectories
- C7: FUSE read timeout (30s) returns EIO instead of hanging
- 6.4: Auto-re-fetch corrupt/missing chunks from origin
- 6.6: Passthrough mode - continue even when CAS write fails
- C9: PID file with flock prevents concurrent mounts
- 5.3: fd exhaustion handling test

All 27 resilience tests now pass. Full test suite green.

Files changed:

- musicfs-origins/src/health.rs: timeout + join_all
- musicfs-origins/Cargo.toml: add futures dependency
- musicfs-cas/src/store.rs: recursive calculate_size
- musicfs-cas/src/reader.rs: auto-re-fetch on IntegrityError/NotFound
- musicfs-cas/src/fetcher.rs: passthrough fallback
- musicfs-fuse/src/filesystem.rs: 30s read timeout
- musicfs-cli/src/main.rs: PID file with flock
- musicfs-test-utils/tests/resilience.rs: updated tests
This commit is contained in:
@@ -12,6 +12,7 @@ sftp = []
|
||||
musicfs-core = { path = "../musicfs-core" }
|
||||
async-trait.workspace = true
|
||||
dashmap.workspace = true
|
||||
futures.workspace = true
|
||||
libc.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["fs", "sync", "time"] }
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use crate::traits::Origin;
|
||||
use dashmap::DashMap;
|
||||
use futures::future::join_all;
|
||||
use musicfs_core::{Event, EventBus, HealthStatus, OriginId, OriginType};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{debug, info, info_span, Instrument};
|
||||
use tracing::{debug, info, info_span, warn, Instrument};
|
||||
|
||||
pub struct HealthMonitor {
|
||||
origins: DashMap<OriginId, Arc<dyn Origin>>,
|
||||
@@ -187,14 +188,30 @@ impl HealthMonitor {
|
||||
.map(|e| (e.key().clone(), e.value().clone()))
|
||||
.collect();
|
||||
|
||||
for (id, origin) in origins {
|
||||
self.check_one(&id, &origin).await;
|
||||
}
|
||||
let checks: Vec<_> = origins
|
||||
.iter()
|
||||
.map(|(id, origin)| self.check_one(id, origin))
|
||||
.collect();
|
||||
|
||||
join_all(checks).await;
|
||||
}
|
||||
|
||||
async fn check_one(&self, id: &OriginId, origin: &Arc<dyn Origin>) {
|
||||
let start = Instant::now();
|
||||
let status = origin.health().await;
|
||||
let health_timeout = Duration::from_millis(1500);
|
||||
|
||||
let status = match tokio::time::timeout(health_timeout, origin.health()).await {
|
||||
Ok(status) => status,
|
||||
Err(_) => {
|
||||
warn!(
|
||||
origin_id = %id,
|
||||
timeout_ms = health_timeout.as_millis() as u64,
|
||||
"Health check timed out"
|
||||
);
|
||||
HealthStatus::Unhealthy
|
||||
}
|
||||
};
|
||||
|
||||
let latency_ms = start.elapsed().as_millis() as u64;
|
||||
|
||||
let threshold = self.threshold_for(origin.origin_type());
|
||||
|
||||
Reference in New Issue
Block a user