Add comprehensive logging with tracing, file rotation, and systemd integration
- Add tracing-appender and tracing-journald for production logging
- Add LoggingConfig with trace_sample_rate, json_output, journald options
- Expand init_logging() with file rotation, journald, and stderr layers
- Add sanitize_path() helper for PII protection in logs
- Instrument FUSE operations with #[instrument] and trace decision points
- Instrument gRPC handlers (10 methods) with span correlation
- Add spawn instrumentation for health monitor, indexer, watcher tasks
- Add broadcast lag handling (RecvError::Lagged) in event subscribers
- Fix webhook.rs expect() calls with proper error handling
- Add logging to patterns.rs, collections.rs, artwork.rs database ops
- Add Drop impl logging for PluginManager and WatchHandle
- Update systemd service with rate limiting and journal output
- Add logrotate config and example config.toml with logging section
This commit is contained in:
@@ -3,7 +3,7 @@ use crate::traits::Origin;
|
||||
use musicfs_core::{Error, RealPath, Result};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{trace, warn};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RetryConfig {
|
||||
@@ -79,6 +79,7 @@ impl FailoverExecutor {
|
||||
let mut last_error = None;
|
||||
|
||||
for origin in origins {
|
||||
trace!(origin_id = %origin.id(), "Attempting read from origin");
|
||||
let start = std::time::Instant::now();
|
||||
match self.read_with_retry(&origin, &path.path, offset, size).await {
|
||||
Ok(data) => {
|
||||
@@ -87,7 +88,7 @@ impl FailoverExecutor {
|
||||
return Ok(data);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Origin {} failed: {}, trying next", origin.id(), e);
|
||||
warn!(origin_id = %origin.id(), error = %e, "Origin failed, trying next");
|
||||
last_error = Some(e);
|
||||
}
|
||||
}
|
||||
@@ -108,13 +109,13 @@ impl FailoverExecutor {
|
||||
Ok(data) => return Ok(data),
|
||||
Err(e) if attempt + 1 < self.retry_config.max_attempts => {
|
||||
let delay = self.retry_config.delay_for_attempt(attempt);
|
||||
debug!(
|
||||
"Retry {}/{} for {} after {:?}: {}",
|
||||
attempt + 1,
|
||||
self.retry_config.max_attempts,
|
||||
origin.id(),
|
||||
delay,
|
||||
e
|
||||
warn!(
|
||||
origin_id = %origin.id(),
|
||||
attempt = attempt + 1,
|
||||
max_attempts = self.retry_config.max_attempts,
|
||||
error = %e,
|
||||
delay_ms = delay.as_millis() as u64,
|
||||
"Retrying read operation"
|
||||
);
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
@@ -142,6 +143,7 @@ impl FailoverExecutor {
|
||||
let mut last_error = None;
|
||||
|
||||
for origin in origins {
|
||||
trace!(origin_id = %origin.id(), "Attempting full read from origin");
|
||||
let start = std::time::Instant::now();
|
||||
match self.read_full_with_retry(&origin, &path.path).await {
|
||||
Ok(data) => {
|
||||
@@ -150,7 +152,7 @@ impl FailoverExecutor {
|
||||
return Ok(data);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Origin {} failed full read: {}, trying next", origin.id(), e);
|
||||
warn!(origin_id = %origin.id(), error = %e, "Origin failed full read, trying next");
|
||||
last_error = Some(e);
|
||||
}
|
||||
}
|
||||
@@ -169,13 +171,13 @@ impl FailoverExecutor {
|
||||
Ok(data) => return Ok(data),
|
||||
Err(e) if attempt + 1 < self.retry_config.max_attempts => {
|
||||
let delay = self.retry_config.delay_for_attempt(attempt);
|
||||
debug!(
|
||||
"Retry full read {}/{} for {} after {:?}: {}",
|
||||
attempt + 1,
|
||||
self.retry_config.max_attempts,
|
||||
origin.id(),
|
||||
delay,
|
||||
e
|
||||
warn!(
|
||||
origin_id = %origin.id(),
|
||||
attempt = attempt + 1,
|
||||
max_attempts = self.retry_config.max_attempts,
|
||||
error = %e,
|
||||
delay_ms = delay.as_millis() as u64,
|
||||
"Retrying full read operation"
|
||||
);
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{debug, info, warn};
|
||||
use tracing::{debug, info, info_span, Instrument};
|
||||
|
||||
pub struct HealthMonitor {
|
||||
origins: DashMap<OriginId, Arc<dyn Origin>>,
|
||||
@@ -150,22 +150,32 @@ impl HealthMonitor {
|
||||
pub fn start(self: Arc<Self>) -> HealthCheckHandle {
|
||||
let (stop_tx, mut stop_rx) = mpsc::channel::<()>(1);
|
||||
let monitor = self.clone();
|
||||
let interval_secs = monitor.check_interval.as_secs();
|
||||
|
||||
tokio::spawn(async move {
|
||||
let mut interval = tokio::time::interval(monitor.check_interval);
|
||||
info!(
|
||||
interval_secs = interval_secs,
|
||||
origin_count = monitor.origins.len(),
|
||||
"Health monitor starting"
|
||||
);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => {
|
||||
monitor.check_all().await;
|
||||
}
|
||||
_ = stop_rx.recv() => {
|
||||
info!("Health monitor stopping");
|
||||
break;
|
||||
tokio::spawn(
|
||||
async move {
|
||||
let mut interval = tokio::time::interval(monitor.check_interval);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => {
|
||||
monitor.check_all().await;
|
||||
}
|
||||
_ = stop_rx.recv() => {
|
||||
info!("Health monitor stopping");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
.instrument(info_span!("health_monitor")),
|
||||
);
|
||||
|
||||
HealthCheckHandle { stop_tx }
|
||||
}
|
||||
@@ -199,14 +209,24 @@ impl HealthMonitor {
|
||||
match status {
|
||||
HealthStatus::Healthy => {
|
||||
if state.status != HealthStatus::Healthy {
|
||||
info!("Origin {} is now healthy", id);
|
||||
info!(
|
||||
origin_id = %id,
|
||||
previous_status = ?state.status,
|
||||
duration_ms = latency_ms,
|
||||
"Origin health state transition to healthy"
|
||||
);
|
||||
}
|
||||
state.status = HealthStatus::Healthy;
|
||||
state.consecutive_failures = 0;
|
||||
}
|
||||
HealthStatus::Degraded => {
|
||||
if state.status != HealthStatus::Degraded {
|
||||
warn!("Origin {} is degraded", id);
|
||||
info!(
|
||||
origin_id = %id,
|
||||
previous_status = ?state.status,
|
||||
duration_ms = latency_ms,
|
||||
"Origin health state transition to degraded"
|
||||
);
|
||||
}
|
||||
state.status = HealthStatus::Degraded;
|
||||
}
|
||||
@@ -214,16 +234,22 @@ impl HealthMonitor {
|
||||
state.consecutive_failures += 1;
|
||||
if state.consecutive_failures >= threshold {
|
||||
if state.status != HealthStatus::Unhealthy {
|
||||
warn!(
|
||||
"Origin {} is now unhealthy ({} failures)",
|
||||
id, state.consecutive_failures
|
||||
info!(
|
||||
origin_id = %id,
|
||||
previous_status = ?state.status,
|
||||
consecutive_failures = state.consecutive_failures,
|
||||
threshold = threshold,
|
||||
duration_ms = latency_ms,
|
||||
"Origin health state transition to unhealthy"
|
||||
);
|
||||
}
|
||||
state.status = HealthStatus::Unhealthy;
|
||||
} else {
|
||||
debug!(
|
||||
"Origin {} check failed ({}/{})",
|
||||
id, state.consecutive_failures, threshold
|
||||
origin_id = %id,
|
||||
consecutive_failures = state.consecutive_failures,
|
||||
threshold = threshold,
|
||||
"Origin health check failed"
|
||||
);
|
||||
state.status = HealthStatus::Degraded;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use dashmap::DashMap;
|
||||
use musicfs_core::{Event, EventBus, OriginId};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{debug, trace, warn};
|
||||
|
||||
pub struct Router {
|
||||
priorities: DashMap<OriginId, u8>,
|
||||
@@ -77,7 +77,7 @@ impl Router {
|
||||
}
|
||||
|
||||
pub fn select(&self, candidates: &[OriginId], health: &HealthSnapshot) -> Option<OriginId> {
|
||||
candidates
|
||||
let selected = candidates
|
||||
.iter()
|
||||
.filter(|id| health.is_healthy(id))
|
||||
.min_by_key(|id| {
|
||||
@@ -85,7 +85,20 @@ impl Router {
|
||||
let latency = self.latency_stats.get(*id).map(|s| s.p50_ms).unwrap_or(0);
|
||||
(priority, latency)
|
||||
})
|
||||
.cloned()
|
||||
.cloned();
|
||||
|
||||
if let Some(ref id) = selected {
|
||||
let priority = self.get_priority(id);
|
||||
let latency = self.latency_stats.get(id).map(|s| s.p50_ms).unwrap_or(0);
|
||||
trace!(
|
||||
origin_id = %id,
|
||||
priority = priority,
|
||||
latency_ms = latency,
|
||||
"Selected healthy origin"
|
||||
);
|
||||
}
|
||||
|
||||
selected
|
||||
}
|
||||
|
||||
pub fn select_with_fallback(
|
||||
@@ -104,6 +117,11 @@ impl Router {
|
||||
.min_by_key(|id| self.get_priority(id))
|
||||
.cloned()
|
||||
{
|
||||
trace!(
|
||||
origin_id = %id,
|
||||
priority = self.get_priority(&id),
|
||||
"Selected degraded origin as fallback"
|
||||
);
|
||||
return Some(id);
|
||||
}
|
||||
|
||||
@@ -115,14 +133,26 @@ impl Router {
|
||||
});
|
||||
}
|
||||
|
||||
candidates
|
||||
let selected = candidates
|
||||
.iter()
|
||||
.min_by_key(|id| {
|
||||
let failures = health.failure_count(id).unwrap_or(u32::MAX);
|
||||
let priority = self.get_priority(id);
|
||||
(failures, priority)
|
||||
})
|
||||
.cloned()
|
||||
.cloned();
|
||||
|
||||
if let Some(ref id) = selected {
|
||||
let failures = health.failure_count(id).unwrap_or(u32::MAX);
|
||||
trace!(
|
||||
origin_id = %id,
|
||||
failure_count = failures,
|
||||
priority = self.get_priority(id),
|
||||
"Selected least-bad unhealthy origin"
|
||||
);
|
||||
}
|
||||
|
||||
selected
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user