Add comprehensive logging with tracing, file rotation, and systemd integration

- Add tracing-appender and tracing-journald for production logging
- Add LoggingConfig with trace_sample_rate, json_output, journald options
- Expand init_logging() with file rotation, journald, and stderr layers
- Add sanitize_path() helper for PII protection in logs
- Instrument FUSE operations with #[instrument] and trace decision points
- Instrument gRPC handlers (10 methods) with span correlation
- Add spawn instrumentation for health monitor, indexer, watcher tasks
- Add broadcast lag handling (RecvError::Lagged) in event subscribers
- Replace expect() calls in webhook.rs with proper error handling
- Add logging to patterns.rs, collections.rs, artwork.rs database ops
- Add Drop impl logging for PluginManager and WatchHandle
- Update systemd service with rate limiting and journal output
- Add logrotate config and example config.toml with logging section
This commit is contained in:
Alexander
2026-05-13 11:21:51 +02:00
parent bc9fa36646
commit 5ac33987c0
32 changed files with 1646 additions and 177 deletions
+19 -17
View File
@@ -3,7 +3,7 @@ use crate::traits::Origin;
use musicfs_core::{Error, RealPath, Result};
use std::sync::Arc;
use std::time::Duration;
use tracing::{debug, warn};
use tracing::{trace, warn};
#[derive(Debug, Clone)]
pub struct RetryConfig {
@@ -79,6 +79,7 @@ impl FailoverExecutor {
let mut last_error = None;
for origin in origins {
trace!(origin_id = %origin.id(), "Attempting read from origin");
let start = std::time::Instant::now();
match self.read_with_retry(&origin, &path.path, offset, size).await {
Ok(data) => {
@@ -87,7 +88,7 @@ impl FailoverExecutor {
return Ok(data);
}
Err(e) => {
warn!("Origin {} failed: {}, trying next", origin.id(), e);
warn!(origin_id = %origin.id(), error = %e, "Origin failed, trying next");
last_error = Some(e);
}
}
@@ -108,13 +109,13 @@ impl FailoverExecutor {
Ok(data) => return Ok(data),
Err(e) if attempt + 1 < self.retry_config.max_attempts => {
let delay = self.retry_config.delay_for_attempt(attempt);
debug!(
"Retry {}/{} for {} after {:?}: {}",
attempt + 1,
self.retry_config.max_attempts,
origin.id(),
delay,
e
warn!(
origin_id = %origin.id(),
attempt = attempt + 1,
max_attempts = self.retry_config.max_attempts,
error = %e,
delay_ms = delay.as_millis() as u64,
"Retrying read operation"
);
tokio::time::sleep(delay).await;
}
@@ -142,6 +143,7 @@ impl FailoverExecutor {
let mut last_error = None;
for origin in origins {
trace!(origin_id = %origin.id(), "Attempting full read from origin");
let start = std::time::Instant::now();
match self.read_full_with_retry(&origin, &path.path).await {
Ok(data) => {
@@ -150,7 +152,7 @@ impl FailoverExecutor {
return Ok(data);
}
Err(e) => {
warn!("Origin {} failed full read: {}, trying next", origin.id(), e);
warn!(origin_id = %origin.id(), error = %e, "Origin failed full read, trying next");
last_error = Some(e);
}
}
@@ -169,13 +171,13 @@ impl FailoverExecutor {
Ok(data) => return Ok(data),
Err(e) if attempt + 1 < self.retry_config.max_attempts => {
let delay = self.retry_config.delay_for_attempt(attempt);
debug!(
"Retry full read {}/{} for {} after {:?}: {}",
attempt + 1,
self.retry_config.max_attempts,
origin.id(),
delay,
e
warn!(
origin_id = %origin.id(),
attempt = attempt + 1,
max_attempts = self.retry_config.max_attempts,
error = %e,
delay_ms = delay.as_millis() as u64,
"Retrying full read operation"
);
tokio::time::sleep(delay).await;
}
+45 -19
View File
@@ -5,7 +5,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::mpsc;
use tracing::{debug, info, warn};
use tracing::{debug, info, info_span, Instrument};
pub struct HealthMonitor {
origins: DashMap<OriginId, Arc<dyn Origin>>,
@@ -150,22 +150,32 @@ impl HealthMonitor {
pub fn start(self: Arc<Self>) -> HealthCheckHandle {
let (stop_tx, mut stop_rx) = mpsc::channel::<()>(1);
let monitor = self.clone();
let interval_secs = monitor.check_interval.as_secs();
tokio::spawn(async move {
let mut interval = tokio::time::interval(monitor.check_interval);
info!(
interval_secs = interval_secs,
origin_count = monitor.origins.len(),
"Health monitor starting"
);
loop {
tokio::select! {
_ = interval.tick() => {
monitor.check_all().await;
}
_ = stop_rx.recv() => {
info!("Health monitor stopping");
break;
tokio::spawn(
async move {
let mut interval = tokio::time::interval(monitor.check_interval);
loop {
tokio::select! {
_ = interval.tick() => {
monitor.check_all().await;
}
_ = stop_rx.recv() => {
info!("Health monitor stopping");
break;
}
}
}
}
});
.instrument(info_span!("health_monitor")),
);
HealthCheckHandle { stop_tx }
}
@@ -199,14 +209,24 @@ impl HealthMonitor {
match status {
HealthStatus::Healthy => {
if state.status != HealthStatus::Healthy {
info!("Origin {} is now healthy", id);
info!(
origin_id = %id,
previous_status = ?state.status,
duration_ms = latency_ms,
"Origin health state transition to healthy"
);
}
state.status = HealthStatus::Healthy;
state.consecutive_failures = 0;
}
HealthStatus::Degraded => {
if state.status != HealthStatus::Degraded {
warn!("Origin {} is degraded", id);
info!(
origin_id = %id,
previous_status = ?state.status,
duration_ms = latency_ms,
"Origin health state transition to degraded"
);
}
state.status = HealthStatus::Degraded;
}
@@ -214,16 +234,22 @@ impl HealthMonitor {
state.consecutive_failures += 1;
if state.consecutive_failures >= threshold {
if state.status != HealthStatus::Unhealthy {
warn!(
"Origin {} is now unhealthy ({} failures)",
id, state.consecutive_failures
info!(
origin_id = %id,
previous_status = ?state.status,
consecutive_failures = state.consecutive_failures,
threshold = threshold,
duration_ms = latency_ms,
"Origin health state transition to unhealthy"
);
}
state.status = HealthStatus::Unhealthy;
} else {
debug!(
"Origin {} check failed ({}/{})",
id, state.consecutive_failures, threshold
origin_id = %id,
consecutive_failures = state.consecutive_failures,
threshold = threshold,
"Origin health check failed"
);
state.status = HealthStatus::Degraded;
}
+35 -5
View File
@@ -3,7 +3,7 @@ use dashmap::DashMap;
use musicfs_core::{Event, EventBus, OriginId};
use std::sync::Arc;
use std::time::Instant;
use tracing::{debug, warn};
use tracing::{debug, trace, warn};
pub struct Router {
priorities: DashMap<OriginId, u8>,
@@ -77,7 +77,7 @@ impl Router {
}
pub fn select(&self, candidates: &[OriginId], health: &HealthSnapshot) -> Option<OriginId> {
candidates
let selected = candidates
.iter()
.filter(|id| health.is_healthy(id))
.min_by_key(|id| {
@@ -85,7 +85,20 @@ impl Router {
let latency = self.latency_stats.get(*id).map(|s| s.p50_ms).unwrap_or(0);
(priority, latency)
})
.cloned()
.cloned();
if let Some(ref id) = selected {
let priority = self.get_priority(id);
let latency = self.latency_stats.get(id).map(|s| s.p50_ms).unwrap_or(0);
trace!(
origin_id = %id,
priority = priority,
latency_ms = latency,
"Selected healthy origin"
);
}
selected
}
pub fn select_with_fallback(
@@ -104,6 +117,11 @@ impl Router {
.min_by_key(|id| self.get_priority(id))
.cloned()
{
trace!(
origin_id = %id,
priority = self.get_priority(&id),
"Selected degraded origin as fallback"
);
return Some(id);
}
@@ -115,14 +133,26 @@ impl Router {
});
}
candidates
let selected = candidates
.iter()
.min_by_key(|id| {
let failures = health.failure_count(id).unwrap_or(u32::MAX);
let priority = self.get_priority(id);
(failures, priority)
})
.cloned()
.cloned();
if let Some(ref id) = selected {
let failures = health.failure_count(id).unwrap_or(u32::MAX);
trace!(
origin_id = %id,
failure_count = failures,
priority = self.get_priority(id),
"Selected least-bad unhealthy origin"
);
}
selected
}
}