Implement Phase A: Stop Dying resilience fixes

Implements all 6 critical resilience fixes from phase-a-stop-dying.md:

- Issue 2.9: Migrate std::sync::RwLock → parking_lot::RwLock (7 files)
  Prevents lock poisoning cascade on writer panic

- Issue 2.2: Add install_panic_hook() to log panics via tracing
  Ensures panics are captured in logs/journald before process death

- Issue 3.7: Add ExecStopPost to systemd service
  Cleans up stale FUSE mounts on service stop

- Issue 2.7: Add check_stale_mount() detection on startup
  Auto-cleans leftover mounts from previous crashes

- Issue 2.10: Integrate sd_notify for systemd lifecycle
  Sends READY=1 after mount, STOPPING on shutdown

- Issue 2.1: Add signal handling with spawn_mount
  Catches SIGTERM/SIGINT for clean shutdown instead of instant death

All 7 Phase A tests pass:
- test_poisoned_tree_lock_returns_eio_not_panic
- test_parking_lot_rwlock_survives_panic
- test_panic_hook_logs_to_tracing
- test_systemd_service_has_execstoppost
- test_stale_mount_check_function_exists
- test_sd_notify_ready_sent
- test_sigterm_triggers_shutdown
This commit is contained in:
Alexander
2026-05-13 14:48:32 +02:00
parent 24086cc744
commit 6285eeb6c0
18 changed files with 301 additions and 63 deletions
+1
View File
@@ -12,6 +12,7 @@ tokio = { workspace = true, features = ["sync"] }
tracing.workspace = true
xxhash-rust.workspace = true
hex.workspace = true
parking_lot.workspace = true
[dev-dependencies]
tempfile.workspace = true
+32
View File
@@ -19,6 +19,38 @@ pub fn sanitize_path(path: &Path) -> String {
path.to_string_lossy().to_string()
}
}
/// Install a custom panic hook that logs panics via tracing before the default behavior.
/// This ensures panics are captured in log files and journald.
pub fn install_panic_hook() {
let default_hook = std::panic::take_hook();
std::panic::set_hook(Box::new(move |info| {
let thread = std::thread::current();
let thread_name = thread.name().unwrap_or("<unnamed>");
let message = if let Some(s) = info.payload().downcast_ref::<&str>() {
(*s).to_string()
} else if let Some(s) = info.payload().downcast_ref::<String>() {
s.clone()
} else {
"unknown panic".to_string()
};
let location = info
.location()
.map(|l| format!("{}:{}:{}", l.file(), l.line(), l.column()))
.unwrap_or_else(|| "unknown location".to_string());
tracing::error!(
thread = thread_name,
location = %location,
"PANIC: {}",
message
);
default_hook(info);
}));
}
pub use credentials::{Credential, CredentialConfig, CredentialError, CredentialStore};
pub use error::{Error, Result};
pub use events::{Event, EventBus};
+4 -5
View File
@@ -1,6 +1,6 @@
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::RwLock;
use std::time::Instant;
#[derive(Default)]
@@ -45,7 +45,7 @@ impl Metrics {
self.fuse_ops.open.load(Ordering::Relaxed),
));
for (op, histogram) in self.fuse_latency.histograms.read().unwrap().iter() {
for (op, histogram) in self.fuse_latency.histograms.read().iter() {
let quantiles = histogram.quantiles();
output.push_str(&format!(
"# HELP musicfs_fuse_latency_seconds FUSE operation latency\n\
@@ -95,7 +95,7 @@ impl Metrics {
"# HELP musicfs_origin_health Origin health status (1=healthy, 0=unhealthy)\n\
# TYPE musicfs_origin_health gauge\n",
);
for (origin_id, healthy) in self.origin_health.status.read().unwrap().iter() {
for (origin_id, healthy) in self.origin_health.status.read().iter() {
output.push_str(&format!(
"musicfs_origin_health{{origin=\"{}\"}} {}\n",
origin_id,
@@ -203,7 +203,7 @@ pub struct FuseLatencyMetrics {
impl FuseLatencyMetrics {
pub fn record(&self, op: &str, latency_secs: f64) {
let mut histograms = self.histograms.write().unwrap();
let mut histograms = self.histograms.write();
histograms
.entry(op.to_string())
.or_default()
@@ -268,7 +268,6 @@ impl OriginHealthMetrics {
pub fn set_health(&self, origin_id: &str, healthy: bool) {
self.status
.write()
.unwrap()
.insert(origin_id.to_string(), healthy);
}
}