Add comprehensive logging with tracing, file rotation, and systemd integration

- Add tracing-appender and tracing-journald for production logging
- Add LoggingConfig with trace_sample_rate, json_output, journald options
- Expand init_logging() with file rotation, journald, and stderr layers
- Add sanitize_path() helper for PII protection in logs
- Instrument FUSE operations with #[instrument] and trace decision points
- Instrument gRPC handlers (10 methods) with span correlation
- Add spawn instrumentation for health monitor, indexer, watcher tasks
- Add broadcast lag handling (RecvError::Lagged) in event subscribers
- Fix webhook.rs expect() calls with proper error handling
- Add logging to patterns.rs, collections.rs, artwork.rs database ops
- Add Drop impl logging for PluginManager and WatchHandle
- Update systemd service with rate limiting and journal output
- Add logrotate config and example config.toml with logging section
2026-05-13 11:21:51 +02:00
parent bc9fa36646
commit 5ac33987c0
32 changed files with 1646 additions and 177 deletions
@@ -3,7 +3,7 @@ use crate::traits::Origin;
 use musicfs_core::{Error, RealPath, Result};
 use std::sync::Arc;
 use std::time::Duration;
-use tracing::{debug, warn};
+use tracing::{trace, warn};

 #[derive(Debug, Clone)]
 pub struct RetryConfig {
@@ -79,6 +79,7 @@ impl FailoverExecutor {
        let mut last_error = None;

        for origin in origins {
+            trace!(origin_id = %origin.id(), "Attempting read from origin");
            let start = std::time::Instant::now();
            match self.read_with_retry(&origin, &path.path, offset, size).await {
                Ok(data) => {
@@ -87,7 +88,7 @@ impl FailoverExecutor {
                    return Ok(data);
                }
                Err(e) => {
-                    warn!("Origin {} failed: {}, trying next", origin.id(), e);
+                    warn!(origin_id = %origin.id(), error = %e, "Origin failed, trying next");
                    last_error = Some(e);
                }
            }
@@ -108,13 +109,13 @@ impl FailoverExecutor {
                Ok(data) => return Ok(data),
                Err(e) if attempt + 1 < self.retry_config.max_attempts => {
                    let delay = self.retry_config.delay_for_attempt(attempt);
-                    debug!(
-                        "Retry {}/{} for {} after {:?}: {}",
-                        attempt + 1,
-                        self.retry_config.max_attempts,
-                        origin.id(),
-                        delay,
-                        e
+                    warn!(
+                        origin_id = %origin.id(),
+                        attempt = attempt + 1,
+                        max_attempts = self.retry_config.max_attempts,
+                        error = %e,
+                        delay_ms = delay.as_millis() as u64,
+                        "Retrying read operation"
                    );
                    tokio::time::sleep(delay).await;
                }
@@ -142,6 +143,7 @@ impl FailoverExecutor {
        let mut last_error = None;

        for origin in origins {
+            trace!(origin_id = %origin.id(), "Attempting full read from origin");
            let start = std::time::Instant::now();
            match self.read_full_with_retry(&origin, &path.path).await {
                Ok(data) => {
@@ -150,7 +152,7 @@ impl FailoverExecutor {
                    return Ok(data);
                }
                Err(e) => {
-                    warn!("Origin {} failed full read: {}, trying next", origin.id(), e);
+                    warn!(origin_id = %origin.id(), error = %e, "Origin failed full read, trying next");
                    last_error = Some(e);
                }
            }
@@ -169,13 +171,13 @@ impl FailoverExecutor {
                Ok(data) => return Ok(data),
                Err(e) if attempt + 1 < self.retry_config.max_attempts => {
                    let delay = self.retry_config.delay_for_attempt(attempt);
-                    debug!(
-                        "Retry full read {}/{} for {} after {:?}: {}",
-                        attempt + 1,
-                        self.retry_config.max_attempts,
-                        origin.id(),
-                        delay,
-                        e
+                    warn!(
+                        origin_id = %origin.id(),
+                        attempt = attempt + 1,
+                        max_attempts = self.retry_config.max_attempts,
+                        error = %e,
+                        delay_ms = delay.as_millis() as u64,
+                        "Retrying full read operation"
                    );
                    tokio::time::sleep(delay).await;
                }
@@ -5,7 +5,7 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio::sync::mpsc;
-use tracing::{debug, info, warn};
+use tracing::{debug, info, info_span, Instrument};

 pub struct HealthMonitor {
    origins: DashMap<OriginId, Arc<dyn Origin>>,
@@ -150,22 +150,32 @@ impl HealthMonitor {
    pub fn start(self: Arc<Self>) -> HealthCheckHandle {
        let (stop_tx, mut stop_rx) = mpsc::channel::<()>(1);
        let monitor = self.clone();
+        let interval_secs = monitor.check_interval.as_secs();

-        tokio::spawn(async move {
-            let mut interval = tokio::time::interval(monitor.check_interval);
+        info!(
+            interval_secs = interval_secs,
+            origin_count = monitor.origins.len(),
+            "Health monitor starting"
+        );

-            loop {
-                tokio::select! {
-                    _ = interval.tick() => {
-                        monitor.check_all().await;
-                    }
-                    _ = stop_rx.recv() => {
-                        info!("Health monitor stopping");
-                        break;
+        tokio::spawn(
+            async move {
+                let mut interval = tokio::time::interval(monitor.check_interval);
+
+                loop {
+                    tokio::select! {
+                        _ = interval.tick() => {
+                            monitor.check_all().await;
+                        }
+                        _ = stop_rx.recv() => {
+                            info!("Health monitor stopping");
+                            break;
+                        }
                    }
                }
            }
-        });
+            .instrument(info_span!("health_monitor")),
+        );

        HealthCheckHandle { stop_tx }
    }
@@ -199,14 +209,24 @@ impl HealthMonitor {
        match status {
            HealthStatus::Healthy => {
                if state.status != HealthStatus::Healthy {
-                    info!("Origin {} is now healthy", id);
+                    info!(
+                        origin_id = %id,
+                        previous_status = ?state.status,
+                        duration_ms = latency_ms,
+                        "Origin health state transition to healthy"
+                    );
                }
                state.status = HealthStatus::Healthy;
                state.consecutive_failures = 0;
            }
            HealthStatus::Degraded => {
                if state.status != HealthStatus::Degraded {
-                    warn!("Origin {} is degraded", id);
+                    info!(
+                        origin_id = %id,
+                        previous_status = ?state.status,
+                        duration_ms = latency_ms,
+                        "Origin health state transition to degraded"
+                    );
                }
                state.status = HealthStatus::Degraded;
            }
@@ -214,16 +234,22 @@ impl HealthMonitor {
                state.consecutive_failures += 1;
                if state.consecutive_failures >= threshold {
                    if state.status != HealthStatus::Unhealthy {
-                        warn!(
-                            "Origin {} is now unhealthy ({} failures)",
-                            id, state.consecutive_failures
+                        info!(
+                            origin_id = %id,
+                            previous_status = ?state.status,
+                            consecutive_failures = state.consecutive_failures,
+                            threshold = threshold,
+                            duration_ms = latency_ms,
+                            "Origin health state transition to unhealthy"
                        );
                    }
                    state.status = HealthStatus::Unhealthy;
                } else {
                    debug!(
-                        "Origin {} check failed ({}/{})",
-                        id, state.consecutive_failures, threshold
+                        origin_id = %id,
+                        consecutive_failures = state.consecutive_failures,
+                        threshold = threshold,
+                        "Origin health check failed"
                    );
                    state.status = HealthStatus::Degraded;
                }
@@ -3,7 +3,7 @@ use dashmap::DashMap;
 use musicfs_core::{Event, EventBus, OriginId};
 use std::sync::Arc;
 use std::time::Instant;
-use tracing::{debug, warn};
+use tracing::{debug, trace, warn};

 pub struct Router {
    priorities: DashMap<OriginId, u8>,
@@ -77,7 +77,7 @@ impl Router {
    }

    pub fn select(&self, candidates: &[OriginId], health: &HealthSnapshot) -> Option<OriginId> {
-        candidates
+        let selected = candidates
            .iter()
            .filter(|id| health.is_healthy(id))
            .min_by_key(|id| {
@@ -85,7 +85,20 @@ impl Router {
                let latency = self.latency_stats.get(*id).map(|s| s.p50_ms).unwrap_or(0);
                (priority, latency)
            })
-            .cloned()
+            .cloned();
+        
+        if let Some(ref id) = selected {
+            let priority = self.get_priority(id);
+            let latency = self.latency_stats.get(id).map(|s| s.p50_ms).unwrap_or(0);
+            trace!(
+                origin_id = %id,
+                priority = priority,
+                latency_ms = latency,
+                "Selected healthy origin"
+            );
+        }
+        
+        selected
    }

    pub fn select_with_fallback(
@@ -104,6 +117,11 @@ impl Router {
            .min_by_key(|id| self.get_priority(id))
            .cloned()
        {
+            trace!(
+                origin_id = %id,
+                priority = self.get_priority(&id),
+                "Selected degraded origin as fallback"
+            );
            return Some(id);
        }

@@ -115,14 +133,26 @@ impl Router {
            });
        }

-        candidates
+        let selected = candidates
            .iter()
            .min_by_key(|id| {
                let failures = health.failure_count(id).unwrap_or(u32::MAX);
                let priority = self.get_priority(id);
                (failures, priority)
            })
-            .cloned()
+            .cloned();
+        
+        if let Some(ref id) = selected {
+            let failures = health.failure_count(id).unwrap_or(u32::MAX);
+            trace!(
+                origin_id = %id,
+                failure_count = failures,
+                priority = self.get_priority(id),
+                "Selected least-bad unhealthy origin"
+            );
+        }
+        
+        selected
    }
 }