Implement Phase C: Production Hardening

Implements phase-c-hardening.md to fix 6 RED resilience tests:

- D1/D2: Health check timeout (1.5s) + parallel execution via join_all
- C6: Recursive CAS calculate_size() to scan shard subdirectories
- C7: FUSE read timeout (30s) returns EIO instead of hanging
- 6.4: Auto-re-fetch corrupt/missing chunks from origin
- 6.6: Passthrough mode - continue even when CAS write fails
- C9: PID file with flock prevents concurrent mounts
- 5.3: fd exhaustion handling test

All 27 resilience tests now pass. Full test suite green.

Files changed:
- musicfs-origins/src/health.rs: timeout + join_all
- musicfs-origins/Cargo.toml: add futures dependency
- musicfs-cas/src/store.rs: recursive calculate_size
- musicfs-cas/src/reader.rs: auto-re-fetch on IntegrityError/NotFound
- musicfs-cas/src/fetcher.rs: passthrough fallback
- musicfs-fuse/src/filesystem.rs: 30s read timeout
- musicfs-cli/src/main.rs: PID file with flock
- musicfs-test-utils/tests/resilience.rs: updated tests
This commit is contained in:
Alexander
2026-05-13 15:55:22 +02:00
parent 3038c94b8c
commit 0ff2a17ab7
11 changed files with 325 additions and 39 deletions
+41 -9
View File
@@ -7,6 +7,9 @@ use musicfs_fuse::MusicFs;
use musicfs_metadata::MetadataParser;
use musicfs_origins::{LocalOrigin, Origin};
use parking_lot::RwLock;
use std::fs::File;
use std::io::Write;
use std::os::unix::io::AsRawFd;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::SystemTime;
@@ -87,6 +90,29 @@ enum OriginCommands {
},
}
/// Guard returned by `try_acquire_lock` for the single-instance lock file.
///
/// The guard carries no behavior of its own; it exists to own the open lock
/// file. The advisory `flock` is taken on this file's descriptor, so keeping
/// the guard alive keeps the descriptor open and, per flock(2), the lock held.
struct LockFile {
// Never read (hence the leading underscore); stored only so the file handle
// is not dropped while the guard is alive.
_file: File,
}
fn try_acquire_lock(path: &Path) -> Result<LockFile> {
let file = File::create(path).context("Failed to create lock file")?;
let fd = file.as_raw_fd();
let ret = unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) };
if ret != 0 {
let err = std::io::Error::last_os_error();
if err.kind() == std::io::ErrorKind::WouldBlock {
anyhow::bail!("MusicFS is already running (lock file: {:?})", path);
}
return Err(err).context("Failed to acquire lock");
}
let mut f = &file;
writeln!(f, "{}", std::process::id())?;
Ok(LockFile { _file: file })
}
fn main() -> Result<()> {
musicfs_core::install_panic_hook();
let cli = Cli::parse();
@@ -139,24 +165,25 @@ fn run_mount(
) -> Result<()> {
let origin_path = origin_path.context("--origin is required for mount")?;
let cache_dir = cache_dir.unwrap_or_else(|| {
dirs::cache_dir()
.unwrap_or_else(|| PathBuf::from("/tmp"))
.join("musicfs")
});
let runtime = tokio::runtime::Runtime::new().context("Failed to create Tokio runtime")?;
let handle = runtime.handle().clone();
let cache_dir_clone = cache_dir.clone();
let (tree, reader) = runtime.block_on(async {
info!(origin = ?origin_path, mountpoint = ?mountpoint, "Mount configuration");
info!("Cache directory: {:?}", cache_dir_clone);
let cache_dir = cache_dir.unwrap_or_else(|| {
dirs::cache_dir()
.unwrap_or_else(|| PathBuf::from("/tmp"))
.join("musicfs")
});
info!("Cache directory: {:?}", cache_dir);
std::fs::create_dir_all(&cache_dir).context("Failed to create cache directory")?;
std::fs::create_dir_all(&cache_dir_clone).context("Failed to create cache directory")?;
std::fs::create_dir_all(&mountpoint).context("Failed to create mountpoint")?;
let cas_config = CasConfig {
chunks_dir: cache_dir.join("chunks"),
chunks_dir: cache_dir_clone.join("chunks"),
..Default::default()
};
let store = Arc::new(
@@ -192,6 +219,11 @@ fn run_mount(
check_stale_mount(&mountpoint)?;
let lock_path = cache_dir.join("musicfs.lock");
let _lock = try_acquire_lock(&lock_path)
.context("Failed to acquire lock — is another instance running?")?;
info!(lock_path = ?lock_path, "Lock acquired");
let fs = MusicFs::with_reader(tree, reader, handle.clone());
info!("Mounting filesystem at {:?}", mountpoint);