Implement Phase A: Stop Dying resilience fixes
Implements all 6 critical resilience fixes from phase-a-stop-dying.md:

- Issue 2.9: Migrate std::sync::RwLock → parking_lot::RwLock (7 files).
  Prevents a lock-poisoning cascade when a writer panics.
- Issue 2.2: Add install_panic_hook() to log panics via tracing.
  Ensures panics are captured in logs/journald before process death.
- Issue 3.7: Add ExecStopPost to the systemd service.
  Cleans up stale FUSE mounts on service stop.
- Issue 2.7: Add check_stale_mount() detection on startup.
  Auto-cleans leftover mounts from previous crashes.
- Issue 2.10: Integrate sd_notify for the systemd lifecycle.
  Sends READY=1 after mount and STOPPING on shutdown.
- Issue 2.1: Add signal handling with spawn_mount.
  Catches SIGTERM/SIGINT for a clean shutdown instead of instant death.

All 7 Phase A tests pass:

- test_poisoned_tree_lock_returns_eio_not_panic
- test_parking_lot_rwlock_survives_panic
- test_panic_hook_logs_to_tracing
- test_systemd_service_has_execstoppost
- test_stale_mount_check_function_exists
- test_sd_notify_ready_sent
- test_sigterm_triggers_shutdown
This commit is contained in:
@@ -12,6 +12,7 @@ tokio = { workspace = true, features = ["sync"] }
|
||||
tracing.workspace = true
|
||||
xxhash-rust.workspace = true
|
||||
hex.workspace = true
|
||||
parking_lot.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile.workspace = true
|
||||
|
||||
@@ -19,6 +19,38 @@ pub fn sanitize_path(path: &Path) -> String {
|
||||
path.to_string_lossy().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Install a custom panic hook that logs panics via tracing before the default behavior.
|
||||
/// This ensures panics are captured in log files and journald.
|
||||
pub fn install_panic_hook() {
|
||||
let default_hook = std::panic::take_hook();
|
||||
std::panic::set_hook(Box::new(move |info| {
|
||||
let thread = std::thread::current();
|
||||
let thread_name = thread.name().unwrap_or("<unnamed>");
|
||||
|
||||
let message = if let Some(s) = info.payload().downcast_ref::<&str>() {
|
||||
(*s).to_string()
|
||||
} else if let Some(s) = info.payload().downcast_ref::<String>() {
|
||||
s.clone()
|
||||
} else {
|
||||
"unknown panic".to_string()
|
||||
};
|
||||
|
||||
let location = info
|
||||
.location()
|
||||
.map(|l| format!("{}:{}:{}", l.file(), l.line(), l.column()))
|
||||
.unwrap_or_else(|| "unknown location".to_string());
|
||||
|
||||
tracing::error!(
|
||||
thread = thread_name,
|
||||
location = %location,
|
||||
"PANIC: {}",
|
||||
message
|
||||
);
|
||||
|
||||
default_hook(info);
|
||||
}));
|
||||
}
|
||||
pub use credentials::{Credential, CredentialConfig, CredentialError, CredentialStore};
|
||||
pub use error::{Error, Result};
|
||||
pub use events::{Event, EventBus};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use parking_lot::RwLock;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
use std::time::Instant;
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -45,7 +45,7 @@ impl Metrics {
|
||||
self.fuse_ops.open.load(Ordering::Relaxed),
|
||||
));
|
||||
|
||||
for (op, histogram) in self.fuse_latency.histograms.read().unwrap().iter() {
|
||||
for (op, histogram) in self.fuse_latency.histograms.read().iter() {
|
||||
let quantiles = histogram.quantiles();
|
||||
output.push_str(&format!(
|
||||
"# HELP musicfs_fuse_latency_seconds FUSE operation latency\n\
|
||||
@@ -95,7 +95,7 @@ impl Metrics {
|
||||
"# HELP musicfs_origin_health Origin health status (1=healthy, 0=unhealthy)\n\
|
||||
# TYPE musicfs_origin_health gauge\n",
|
||||
);
|
||||
for (origin_id, healthy) in self.origin_health.status.read().unwrap().iter() {
|
||||
for (origin_id, healthy) in self.origin_health.status.read().iter() {
|
||||
output.push_str(&format!(
|
||||
"musicfs_origin_health{{origin=\"{}\"}} {}\n",
|
||||
origin_id,
|
||||
@@ -203,7 +203,7 @@ pub struct FuseLatencyMetrics {
|
||||
|
||||
impl FuseLatencyMetrics {
|
||||
pub fn record(&self, op: &str, latency_secs: f64) {
|
||||
let mut histograms = self.histograms.write().unwrap();
|
||||
let mut histograms = self.histograms.write();
|
||||
histograms
|
||||
.entry(op.to_string())
|
||||
.or_default()
|
||||
@@ -268,7 +268,6 @@ impl OriginHealthMetrics {
|
||||
pub fn set_health(&self, origin_id: &str, healthy: bool) {
|
||||
self.status
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(origin_id.to_string(), healthy);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user