Move the files around

This commit is contained in:
Alexander
2026-05-13 20:34:14 +02:00
parent 90e9683076
commit 305d027c8b
113 changed files with 650 additions and 3569 deletions
+22
View File
@@ -0,0 +1,22 @@
[package]
name = "musicfs-sync"
version.workspace = true
edition.workspace = true
[dependencies]
musicfs-core = { path = "../musicfs-core" }
musicfs-origins = { path = "../musicfs-origins" }
fastcdc = "3"
xxhash-rust = { version = "0.8", features = ["xxh64"] }
notify = "6"
rmp-serde = "1"
tokio = { workspace = true }
tracing = { workspace = true }
thiserror = { workspace = true }
serde = { workspace = true }
async-trait = { workspace = true }
[dev-dependencies]
tempfile = { workspace = true }
+232
View File
@@ -0,0 +1,232 @@
use fastcdc::v2020::FastCDC;
use musicfs_core::ChunkHash;
/// Content-defined chunker configuration.
///
/// Holds the three size parameters that are forwarded unchanged to
/// `FastCDC::new` every time data is chunked.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CdcChunker {
    /// Minimum chunk size handed to FastCDC.
    min_size: u32,
    /// Target average chunk size handed to FastCDC.
    avg_size: u32,
    /// Maximum chunk size handed to FastCDC.
    max_size: u32,
}
impl Default for CdcChunker {
    /// Returns a chunker configured with a 16 KiB minimum, 64 KiB average
    /// and 256 KiB maximum chunk size.
    fn default() -> Self {
        const KIB: u32 = 1024;

        Self {
            min_size: 16 * KIB,
            avg_size: 64 * KIB,
            max_size: 256 * KIB,
        }
    }
}
/// An owned chunk produced by [`CdcChunker::chunk`].
#[derive(Clone, Debug)]
pub struct Chunk {
    /// Hash computed over this chunk's bytes via `ChunkHash::from_bytes`.
    pub hash: ChunkHash,
    /// Byte offset of the chunk within the input that was chunked.
    pub offset: u64,
    /// Number of bytes in the chunk.
    pub length: u32,
    /// Owned copy of the chunk's bytes.
    pub data: Vec<u8>,
}
/// A borrowed chunk: same metadata as [`Chunk`], but `data` points into the
/// caller's input buffer instead of owning a copy.
#[derive(Debug)]
pub struct ChunkRef<'a> {
    /// Hash computed over this chunk's bytes via `ChunkHash::from_bytes`.
    pub hash: ChunkHash,
    /// Byte offset of the chunk within the input that was chunked.
    pub offset: u64,
    /// Number of bytes in the chunk.
    pub length: u32,
    /// Slice of the input buffer covering exactly this chunk.
    pub data: &'a [u8],
}
impl CdcChunker {
    /// Creates a chunker with explicit minimum, average and maximum chunk sizes.
    ///
    /// The values are stored as-is and only handed to `FastCDC::new` when one
    /// of the chunking methods is called.
    // NOTE(review): FastCDC::new presumably rejects out-of-range sizes at
    // chunking time rather than here — confirm against the fastcdc docs.
    pub fn new(min_size: u32, avg_size: u32, max_size: u32) -> Self {
        Self {
            min_size,
            avg_size,
            max_size,
        }
    }

    /// Splits `data` into chunks, copying each chunk's bytes into an owned
    /// [`Chunk`].
    pub fn chunk(&self, data: &[u8]) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        for cut in FastCDC::new(data, self.min_size, self.avg_size, self.max_size) {
            let end = cut.offset + cut.length;
            let bytes = &data[cut.offset..end];
            chunks.push(Chunk {
                hash: ChunkHash::from_bytes(bytes),
                offset: cut.offset as u64,
                length: cut.length as u32,
                data: bytes.to_vec(),
            });
        }
        chunks
    }

    /// Splits `data` into chunks that borrow from `data` instead of copying.
    pub fn chunk_refs<'a>(&self, data: &'a [u8]) -> Vec<ChunkRef<'a>> {
        let mut refs = Vec::new();
        for cut in FastCDC::new(data, self.min_size, self.avg_size, self.max_size) {
            let end = cut.offset + cut.length;
            let bytes = &data[cut.offset..end];
            refs.push(ChunkRef {
                hash: ChunkHash::from_bytes(bytes),
                offset: cut.offset as u64,
                length: cut.length as u32,
                data: bytes,
            });
        }
        refs
    }

    /// Feeds every chunk of `data` to `processor` in order, without
    /// collecting them, and returns how many chunks were produced.
    pub fn chunk_streaming<F>(&self, data: &[u8], mut processor: F) -> usize
    where
        F: FnMut(ChunkRef<'_>),
    {
        FastCDC::new(data, self.min_size, self.avg_size, self.max_size).fold(
            0usize,
            |seen, cut| {
                let end = cut.offset + cut.length;
                let bytes = &data[cut.offset..end];
                processor(ChunkRef {
                    hash: ChunkHash::from_bytes(bytes),
                    offset: cut.offset as u64,
                    length: cut.length as u32,
                    data: bytes,
                });
                seen + 1
            },
        )
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// Chunks must cover the input exactly and be laid out back to back.
    #[test]
    fn test_cdc_basic() {
        let data = vec![0u8; 256 * 1024];
        let chunks = CdcChunker::default().chunk(&data);
        assert!(!chunks.is_empty());

        let total: u64 = chunks.iter().map(|c| u64::from(c.length)).sum();
        assert_eq!(total, data.len() as u64);

        let mut expected_offset = 0u64;
        for chunk in &chunks {
            assert_eq!(chunk.offset, expected_offset);
            expected_offset += u64::from(chunk.length);
        }
    }

    /// Prepending bytes must leave at least one chunk hash unchanged.
    #[test]
    fn test_cdc_stable_boundaries() {
        let chunker = CdcChunker::new(4 * 1024, 16 * 1024, 64 * 1024);

        let data1: Vec<u8> = (0..512 * 1024usize)
            .map(|i| ((i * 17 + 31) % 256) as u8)
            .collect();
        let data2: Vec<u8> = std::iter::repeat(0xFFu8)
            .take(1024)
            .chain(data1.iter().copied())
            .collect();

        let chunks1 = chunker.chunk(&data1);
        let chunks2 = chunker.chunk(&data2);

        let hashes1: HashSet<_> = chunks1.iter().map(|c| c.hash).collect();
        let hashes2: HashSet<_> = chunks2.iter().map(|c| c.hash).collect();
        let shared = hashes1.intersection(&hashes2).count();

        assert!(
            shared > 0,
            "CDC should produce stable boundaries, got {} chunks in original, {} after prepend",
            chunks1.len(),
            chunks2.len()
        );
    }

    /// Every chunk except the one ending the input must stay within loose
    /// bounds around the configured minimum and maximum.
    #[test]
    fn test_cdc_chunk_sizes() {
        let chunker = CdcChunker::default();
        let data: Vec<u8> = (0..1024 * 1024)
            .map(|i| ((i * 17 + 31) % 256) as u8)
            .collect();
        let input_len = data.len() as u64;

        let chunks = chunker.chunk(&data);
        let non_final = chunks
            .iter()
            .filter(|c| c.offset + c.length as u64 != input_len);
        for chunk in non_final {
            assert!(
                chunk.length >= chunker.min_size / 2,
                "Chunk too small: {}",
                chunk.length
            );
            assert!(
                chunk.length <= chunker.max_size * 2,
                "Chunk too large: {}",
                chunk.length
            );
        }
    }

    /// The streaming API must report the same chunks as the batch API.
    #[test]
    fn test_cdc_streaming() {
        let chunker = CdcChunker::default();
        let data = vec![0u8; 256 * 1024];

        let mut streamed = Vec::new();
        let count = chunker.chunk_streaming(&data, |chunk| {
            streamed.push((chunk.hash, chunk.offset, chunk.length));
        });

        let batched = chunker.chunk(&data);
        assert_eq!(count, batched.len());
        for (i, chunk) in batched.iter().enumerate() {
            let (hash, offset, length) = streamed[i];
            assert_eq!(hash, chunk.hash);
            assert_eq!(offset, chunk.offset);
            assert_eq!(length, chunk.length);
        }
    }

    /// A 100-byte overwrite in the middle of a 2 MiB pseudo-random file must
    /// leave more than 90% of the chunks reusable.
    #[test]
    fn test_bandwidth_reduction_metadata_edit() {
        let chunker = CdcChunker::new(4 * 1024, 16 * 1024, 64 * 1024);

        // Deterministic pseudo-random bytes from a simple LCG.
        let mut state = 12345u64;
        let original: Vec<u8> = (0..2 * 1024 * 1024)
            .map(|_| {
                state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
                (state >> 56) as u8
            })
            .collect();

        let chunks1 = chunker.chunk(&original);
        let hashes1: HashSet<_> = chunks1.iter().map(|c| c.hash).collect();

        let mut modified = original.clone();
        let mid = modified.len() / 2;
        modified[mid..mid + 100].fill(0xFF);

        let chunks2 = chunker.chunk(&modified);
        let hashes2: HashSet<_> = chunks2.iter().map(|c| c.hash).collect();

        let reused = hashes1.intersection(&hashes2).count();
        let reuse_ratio = reused as f64 / chunks2.len() as f64;

        // NFR-6.4 requires >90% bandwidth reduction for typical edits
        assert!(
            reuse_ratio > 0.90,
            "Expected >90% chunk reuse for mid-file edit (NFR-6.4). Reused {}/{} chunks ({:.1}%, total {} original)",
            reused,
            chunks2.len(),
            reuse_ratio * 100.0,
            chunks1.len()
        );
    }
}
+338
View File
@@ -0,0 +1,338 @@
use crate::cdc::CdcChunker;
use musicfs_core::{ChunkHash, FileId, FileMeta, OriginId};
use musicfs_origins::Origin;
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::time::SystemTime;
use tracing::{debug, info, trace};
/// A file found while walking an origin's directory tree.
#[derive(Clone, Debug)]
pub struct ScannedFile {
    /// Path of the file, built by joining each directory with the entry name
    /// during the scan.
    pub path: PathBuf,
    /// Identifier of the origin the file was found on.
    pub origin_id: OriginId,
    /// Size reported by the origin's `stat`.
    pub size: u64,
    /// Modification time reported by the origin's `stat`.
    pub mtime: SystemTime,
}
/// Result of comparing an origin scan against the cached file metadata.
#[derive(Default, Debug)]
pub struct ChangeSet {
    /// Files present on the origin with no cached entry at the same path.
    pub added: Vec<ScannedFile>,
    /// Cached files whose path was not seen in the origin scan.
    pub removed: Vec<FileId>,
    /// Cached files whose size or mtime changed, with their chunk-level diff.
    pub modified: Vec<(FileId, ManifestDiff)>,
}
impl ChangeSet {
    /// Returns `true` when no additions, removals or modifications were recorded.
    pub fn is_empty(&self) -> bool {
        let has_any = !self.added.is_empty()
            || !self.removed.is_empty()
            || !self.modified.is_empty();
        !has_any
    }

    /// Returns the combined number of added, removed and modified files.
    pub fn total_changes(&self) -> usize {
        [self.added.len(), self.removed.len(), self.modified.len()]
            .iter()
            .sum()
    }
}
/// One entry of a file's chunk manifest: the chunk's hash and its position
/// in the file, without the chunk bytes.
#[derive(Clone, Debug)]
pub struct ManifestChunk {
    /// Hash identifying the chunk's content.
    pub hash: ChunkHash,
    /// Byte offset of the chunk within the file.
    pub offset: u64,
    /// Length of the chunk.
    pub size: u32,
}
/// Chunk-level difference between a file's old manifest and its new one.
#[derive(Debug)]
pub struct ManifestDiff {
    /// Chunks of the new manifest whose hash also occurs in the old manifest.
    pub reuse: Vec<ManifestChunk>,
    /// Chunks of the new manifest whose hash does not occur in the old manifest.
    pub fetch: Vec<ManifestChunk>,
    /// Hashes of old-manifest chunks that no longer occur in the new manifest.
    pub orphaned: Vec<ChunkHash>,
}
/// Detects added, removed and modified files by comparing an origin scan
/// with cached metadata and chunk manifests.
pub struct DeltaDetector {
    /// Chunker used to re-chunk files that appear modified.
    chunker: CdcChunker,
}
impl DeltaDetector {
    /// Creates a detector that chunks with [`CdcChunker::default`].
    pub fn new() -> Self {
        Self {
            chunker: CdcChunker::default(),
        }
    }

    /// Creates a detector that chunks modified files with `chunker`.
    pub fn with_chunker(chunker: CdcChunker) -> Self {
        Self { chunker }
    }

    /// Scans `origin` and compares the result against `cached` metadata.
    ///
    /// * A scanned file with no cached entry at the same path is reported as
    ///   added.
    /// * A cached file whose path was not scanned is reported as removed.
    /// * A cached file whose size or mtime differs is reported as modified.
    ///   If `manifests` has an entry for it, the file is read in full and
    ///   re-chunked to compute the diff. Otherwise the change is still
    ///   reported, but with an empty diff and without reading the file
    ///   (such files used to be dropped silently).
    ///
    /// # Errors
    ///
    /// Returns [`DeltaError::OriginScan`] when listing or stat-ing fails and
    /// [`DeltaError::OriginRead`] when a modified file cannot be read.
    pub async fn detect_changes(
        &self,
        origin: &dyn Origin,
        cached: &HashMap<FileId, FileMeta>,
        manifests: &HashMap<FileId, Vec<ManifestChunk>>,
    ) -> Result<ChangeSet, DeltaError> {
        let origin_id = origin.id().clone();
        info!(origin_id = %origin_id, "Starting delta detection");

        let origin_files = self.scan_origin(origin).await?;
        trace!(origin_id = %origin_id, scanned_count = origin_files.len(), "Completed origin scan");

        // NOTE(review): entries are matched by path only. If `cached` can hold
        // files from several origins, those from other origins would be
        // reported as removed — confirm callers pre-filter by origin.
        let cached_by_path: HashMap<_, _> = cached
            .values()
            .map(|meta| (meta.real_path.path.as_path(), meta))
            .collect();

        let mut changes = ChangeSet::default();
        for scanned in &origin_files {
            match cached_by_path.get(scanned.path.as_path()) {
                None => {
                    debug!(origin_id = %origin_id, path = ?scanned.path, "File added");
                    changes.added.push(scanned.clone());
                }
                Some(cached_file) if self.is_modified_scan(cached_file, scanned) => {
                    debug!(origin_id = %origin_id, path = ?scanned.path, "File modified");
                    let diff = match manifests.get(&cached_file.id) {
                        Some(old_chunks) => {
                            let new_chunks = self.compute_chunks_for_scan(origin, scanned).await?;
                            self.compute_diff(old_chunks, &new_chunks)
                        }
                        None => {
                            // No cached chunks: nothing to reuse or orphan, and
                            // reading the whole file just to list chunks nobody
                            // holds would be wasted I/O. Report the change with
                            // an empty diff so it is not lost.
                            debug!(
                                origin_id = %origin_id,
                                path = ?scanned.path,
                                "Modified file has no cached manifest"
                            );
                            ManifestDiff {
                                reuse: Vec::new(),
                                fetch: Vec::new(),
                                orphaned: Vec::new(),
                            }
                        }
                    };
                    changes.modified.push((cached_file.id, diff));
                }
                Some(_) => {}
            }
        }

        let origin_paths: HashSet<_> = origin_files.iter().map(|f| f.path.as_path()).collect();
        for cached_file in cached.values() {
            if !origin_paths.contains(cached_file.real_path.path.as_path()) {
                debug!(origin_id = %origin_id, path = ?cached_file.real_path.path, "File removed");
                changes.removed.push(cached_file.id);
            }
        }

        info!(
            origin_id = %origin_id,
            files_added = changes.added.len(),
            files_removed = changes.removed.len(),
            files_modified = changes.modified.len(),
            "Delta detection complete"
        );
        Ok(changes)
    }

    /// Returns `true` when the scanned size or mtime differs from the cached one.
    fn is_modified_scan(&self, cached: &FileMeta, scanned: &ScannedFile) -> bool {
        cached.size != scanned.size || cached.mtime != scanned.mtime
    }

    /// Walks the origin from `/` with an explicit directory stack and returns
    /// every file whose name passes [`Self::is_audio_file`].
    ///
    /// Any `readdir` or `stat` failure aborts the scan.
    async fn scan_origin(&self, origin: &dyn Origin) -> Result<Vec<ScannedFile>, DeltaError> {
        let mut files = Vec::new();
        let mut dirs_to_scan = vec![PathBuf::from("/")];

        while let Some(dir) = dirs_to_scan.pop() {
            let entries = origin
                .readdir(&dir)
                .await
                .map_err(|e| DeltaError::OriginScan(e.to_string()))?;

            for entry in entries {
                let entry_path = dir.join(&entry.name);
                if entry.is_dir {
                    dirs_to_scan.push(entry_path);
                } else if Self::is_audio_file(&entry.name) {
                    let stat = origin
                        .stat(&entry_path)
                        .await
                        .map_err(|e| DeltaError::OriginScan(e.to_string()))?;
                    files.push(ScannedFile {
                        path: entry_path,
                        origin_id: origin.id().clone(),
                        size: stat.size,
                        mtime: stat.mtime,
                    });
                }
            }
        }
        Ok(files)
    }

    /// Returns `true` when `name` ends, case-insensitively, with one of the
    /// recognised audio suffixes.
    fn is_audio_file(name: &str) -> bool {
        const AUDIO_SUFFIXES: [&str; 7] =
            [".flac", ".mp3", ".ogg", ".wav", ".m4a", ".aac", ".opus"];

        let lower = name.to_lowercase();
        AUDIO_SUFFIXES.iter().any(|suffix| lower.ends_with(suffix))
    }

    /// Reads the whole file from the origin and returns its chunk manifest.
    async fn compute_chunks_for_scan(
        &self,
        origin: &dyn Origin,
        scanned: &ScannedFile,
    ) -> Result<Vec<ManifestChunk>, DeltaError> {
        let data = origin
            .read_full(&scanned.path)
            .await
            .map_err(|e| DeltaError::OriginRead(e.to_string()))?;

        let manifest = self
            .chunker
            .chunk_refs(&data)
            .into_iter()
            .map(|c| ManifestChunk {
                hash: c.hash,
                offset: c.offset,
                size: c.length,
            })
            .collect();
        Ok(manifest)
    }

    /// Classifies chunks by hash: new chunks present in the old manifest are
    /// reused, the rest must be fetched, and old hashes missing from the new
    /// manifest are orphaned. Input order is preserved in each list.
    fn compute_diff(
        &self,
        old_chunks: &[ManifestChunk],
        new_chunks: &[ManifestChunk],
    ) -> ManifestDiff {
        let old_hashes: HashSet<_> = old_chunks.iter().map(|c| c.hash).collect();
        let new_hashes: HashSet<_> = new_chunks.iter().map(|c| c.hash).collect();

        // One pass over the new manifest fills both lists.
        let (reuse, fetch): (Vec<ManifestChunk>, Vec<ManifestChunk>) = new_chunks
            .iter()
            .cloned()
            .partition(|c| old_hashes.contains(&c.hash));

        let orphaned = old_chunks
            .iter()
            .filter(|c| !new_hashes.contains(&c.hash))
            .map(|c| c.hash)
            .collect();

        ManifestDiff {
            reuse,
            fetch,
            orphaned,
        }
    }
}
impl Default for DeltaDetector {
fn default() -> Self {
Self::new()
}
}
/// Failures raised during delta detection.
///
/// Each variant carries the stringified error from the origin.
#[derive(Debug, thiserror::Error)]
pub enum DeltaError {
    /// Reading a file's full contents from the origin failed.
    #[error("Origin read error: {0}")]
    OriginRead(String),

    /// Listing a directory or stat-ing a file on the origin failed.
    #[error("Origin scan error: {0}")]
    OriginScan(String),
}
#[cfg(test)]
mod tests {
    use super::*;
    use musicfs_core::{OriginId, RealPath, VirtualPath};
    use std::time::SystemTime;

    /// Cached metadata fixture with the epoch as mtime.
    fn make_file_meta(id: i64, path: &str, size: u64) -> FileMeta {
        let real_path = RealPath {
            origin_id: OriginId::from("test"),
            path: PathBuf::from(path),
        };
        FileMeta {
            id: FileId(id),
            virtual_path: VirtualPath::new(format!("/test/{}", path)),
            real_path,
            size,
            mtime: SystemTime::UNIX_EPOCH,
            content_hash: None,
            audio: None,
        }
    }

    /// Scan-result fixture with the epoch as mtime.
    fn make_scanned_file(path: &str, size: u64) -> ScannedFile {
        ScannedFile {
            path: PathBuf::from(path),
            origin_id: OriginId::from("test"),
            size,
            mtime: SystemTime::UNIX_EPOCH,
        }
    }

    /// Manifest entry of 256 bytes whose hash is derived from `tag`.
    fn manifest_chunk(tag: &[u8], offset: u64) -> ManifestChunk {
        ManifestChunk {
            hash: ChunkHash::from_bytes(tag),
            offset,
            size: 256,
        }
    }

    #[test]
    fn test_is_modified_size_change() {
        let cached = make_file_meta(1, "test.flac", 1000);
        let scanned = make_scanned_file("test.flac", 2000);
        assert!(DeltaDetector::new().is_modified_scan(&cached, &scanned));
    }

    #[test]
    fn test_is_modified_same() {
        let cached = make_file_meta(1, "test.flac", 1000);
        let scanned = make_scanned_file("test.flac", 1000);
        assert!(!DeltaDetector::new().is_modified_scan(&cached, &scanned));
    }

    #[test]
    fn test_is_audio_file() {
        for accepted in ["track.flac", "song.MP3", "audio.ogg"] {
            assert!(DeltaDetector::is_audio_file(accepted));
        }
        for rejected in ["readme.txt", "cover.jpg"] {
            assert!(!DeltaDetector::is_audio_file(rejected));
        }
    }

    /// Replacing the middle chunk keeps two chunks, fetches one and orphans one.
    #[test]
    fn test_compute_diff() {
        let old_chunks = vec![
            manifest_chunk(b"A", 0),
            manifest_chunk(b"B", 256),
            manifest_chunk(b"C", 512),
        ];
        let new_chunks = vec![
            manifest_chunk(b"A", 0),
            manifest_chunk(b"D", 256),
            manifest_chunk(b"C", 512),
        ];

        let diff = DeltaDetector::new().compute_diff(&old_chunks, &new_chunks);

        assert_eq!(diff.reuse.len(), 2);
        assert_eq!(diff.fetch.len(), 1);
        assert_eq!(diff.orphaned.len(), 1);
    }
}
+7
View File
@@ -0,0 +1,7 @@
//! Synchronisation building blocks: content-defined chunking, delta
//! detection against an origin, and filesystem watching.

pub mod cdc;
pub mod delta;
pub mod watcher;

pub use cdc::{CdcChunker, Chunk, ChunkRef};
// `ScannedFile` is part of `ChangeSet`'s public fields, so it is re-exported
// alongside the other delta types.
pub use delta::{
    ChangeSet, DeltaDetector, DeltaError, ManifestChunk, ManifestDiff, ScannedFile,
};
pub use watcher::{OriginWatcher, WatchError, WatchHandle};
+218
View File
@@ -0,0 +1,218 @@
use musicfs_core::{Event, EventBus, OriginId, VirtualPath};
use notify::{Config, RecommendedWatcher, RecursiveMode, Watcher};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;
use tokio::sync::mpsc;
use tracing::{error, info, info_span, trace, Instrument};
/// Debounce window in milliseconds: an event for a path is skipped when the
/// previous handled event for that same path is more recent than this.
const DEBOUNCE_MS: u64 = 200;
/// Watches an origin's root directory and publishes file events to an
/// [`EventBus`].
pub struct OriginWatcher {
    /// Identifier attached to log records and to `FileAdded` events.
    origin_id: OriginId,
    /// Directory that is watched recursively; event paths are made relative to it.
    root: PathBuf,
    /// Bus that receives the published events.
    event_bus: Arc<EventBus>,
}
impl OriginWatcher {
pub fn new(origin_id: OriginId, root: PathBuf, event_bus: Arc<EventBus>) -> Self {
Self {
origin_id,
root,
event_bus,
}
}
pub fn start(self) -> WatchHandle {
let (stop_tx, mut stop_rx) = mpsc::channel::<()>(1);
let origin_id = self.origin_id.clone();
let root = self.root.clone();
let event_bus = self.event_bus.clone();
let origin_id_str = origin_id.to_string();
tokio::spawn(
async move {
if let Err(e) = Self::watch_loop(&origin_id, &root, &event_bus, &mut stop_rx).await
{
error!("Watcher error: {}", e);
}
}
.instrument(info_span!("file_watcher", origin_id = %origin_id_str)),
);
WatchHandle { stop_tx }
}
async fn watch_loop(
origin_id: &OriginId,
root: &Path,
event_bus: &EventBus,
stop_rx: &mut mpsc::Receiver<()>,
) -> Result<(), WatchError> {
let (tx, mut rx) = mpsc::channel(100);
let mut watcher = RecommendedWatcher::new(
move |res: Result<notify::Event, notify::Error>| {
if let Ok(event) = res {
let _ = tx.blocking_send(event);
}
},
Config::default(),
)
.map_err(|e| WatchError::Init(e.to_string()))?;
watcher
.watch(root, RecursiveMode::Recursive)
.map_err(|e| WatchError::Watch(e.to_string()))?;
info!(origin_id = %origin_id, path = ?root, "Watcher started");
let mut debouncer: HashMap<PathBuf, Instant> = HashMap::new();
loop {
tokio::select! {
Some(event) = rx.recv() => {
Self::handle_notify_event(origin_id, root, event_bus, event, &mut debouncer);
}
_ = stop_rx.recv() => {
info!(origin_id = %origin_id, "Watcher stopped");
break;
}
}
}
Ok(())
}
fn handle_notify_event(
origin_id: &OriginId,
root: &Path,
event_bus: &EventBus,
event: notify::Event,
debouncer: &mut HashMap<PathBuf, Instant>,
) {
use notify::EventKind;
let now = Instant::now();
for path in event.paths {
let relative = match path.strip_prefix(root) {
Ok(p) => p.to_path_buf(),
Err(_) => continue,
};
if !Self::is_audio_file(&path) {
continue;
}
if let Some(last_seen) = debouncer.get(&relative) {
if now.duration_since(*last_seen).as_millis() < DEBOUNCE_MS as u128 {
trace!(origin_id = %origin_id, path = ?relative, "Debouncing event");
continue;
}
}
debouncer.insert(relative.clone(), now);
let vpath = VirtualPath::new(format!("/{}", relative.display()));
match event.kind {
EventKind::Create(_) => {
trace!(origin_id = %origin_id, path = ?relative, "File created");
event_bus.publish(Event::FileAdded {
path: vpath,
origin_id: origin_id.clone(),
});
}
EventKind::Remove(_) => {
trace!(origin_id = %origin_id, path = ?relative, "File removed");
event_bus.publish(Event::FileRemoved {
path: vpath,
file_id: None,
});
}
EventKind::Modify(_) => {
trace!(origin_id = %origin_id, path = ?relative, "File modified");
event_bus.publish(Event::FileModified { path: vpath });
}
_ => {}
}
}
}
fn is_audio_file(path: &Path) -> bool {
matches!(
path.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase())
.as_deref(),
Some("flac" | "mp3" | "ogg" | "wav" | "m4a" | "aac" | "opus")
)
}
}
/// Handle returned by [`OriginWatcher::start`]; stopping or dropping it
/// signals the watch loop to exit.
pub struct WatchHandle {
    /// Sender side of the stop channel the watch loop listens on.
    stop_tx: mpsc::Sender<()>,
}
impl WatchHandle {
    /// Sends the stop signal, waiting for channel capacity if necessary.
    ///
    /// A failed send is ignored.
    pub async fn stop(self) {
        let _send_failed = self.stop_tx.send(()).await.is_err();
    }
}
impl Drop for WatchHandle {
    /// Logs the drop and makes a non-blocking attempt to send the stop
    /// signal; a full or closed channel is ignored.
    fn drop(&mut self) {
        trace!("WatchHandle dropped");
        let _not_delivered = self.stop_tx.try_send(()).is_err();
    }
}
/// Failures that prevent the watch loop from starting.
///
/// Each variant carries the stringified backend error.
#[derive(Debug, thiserror::Error)]
pub enum WatchError {
    /// The backend watcher could not be constructed.
    #[error("Failed to initialize watcher: {0}")]
    Init(String),

    /// The root path could not be registered with the watcher.
    #[error("Failed to watch path: {0}")]
    Watch(String),
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;
    use tempfile::TempDir;

    /// Writing a new audio file under the watched root must yield a
    /// `FileAdded` event as the first event on the bus.
    #[tokio::test]
    async fn test_watcher_detects_create() {
        let dir = TempDir::new().unwrap();
        let root = dir.path().to_path_buf();
        let event_bus = Arc::new(EventBus::default());
        let mut rx = event_bus.subscribe();

        let handle = OriginWatcher::new(OriginId::from("test"), root.clone(), event_bus).start();

        // Give the backend time to register the watch before touching the tree.
        tokio::time::sleep(Duration::from_millis(100)).await;
        std::fs::write(root.join("test.flac"), b"audio").unwrap();
        tokio::time::sleep(Duration::from_millis(300)).await;

        let event = rx.try_recv();
        assert!(matches!(event, Ok(Event::FileAdded { .. })));

        handle.stop().await;
    }

    #[test]
    fn test_is_audio_file() {
        for accepted in ["/music/song.flac", "/music/song.MP3"] {
            assert!(OriginWatcher::is_audio_file(Path::new(accepted)));
        }
        for rejected in ["/music/cover.jpg", "/music/readme.txt"] {
            assert!(!OriginWatcher::is_audio_file(Path::new(rejected)));
        }
    }
}