Files
MusicFS/musicfs/crates/musicfs-sync/src/delta.rs
T
Alexander 32c96701c8 Implement Week 5 CDC & Delta Detection with Oracle fixes
- Add CdcChunker using FastCDC v3 (16KB/64KB/256KB chunks)
- Add DeltaDetector with scan_origin() returning ScannedFile (no FileId assignment)
- Add OriginWatcher with inotify and 200ms debounce using tokio::spawn
- Fix LocalOrigin::read() to loop until all bytes read
- Add read_full() method to Origin trait
- Add mtime field to ChunkManifest
- Update ContentFetcher to use CDC chunking
- Update bandwidth reduction test to assert >90% (NFR-6.4)

Tests: 71 pass (+11 new)
2026-05-12 20:05:44 +02:00

330 lines
9.3 KiB
Rust

use crate::cdc::CdcChunker;
use musicfs_core::{ChunkHash, FileId, FileMeta, OriginId, RealPath, VirtualPath};
use musicfs_origins::Origin;
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::time::SystemTime;
use tracing::{debug, info};
/// A file discovered while scanning an origin.
///
/// Carries only what the scan itself observed; no `FileId` is attached, so
/// the caller decides how newly discovered files are identified.
#[derive(Debug, Clone)]
pub struct ScannedFile {
/// Path of the file as built during the scan (parent directory joined with
/// the directory-entry name).
pub path: PathBuf,
/// Identifier of the origin the file was found on.
pub origin_id: OriginId,
/// File size as reported by the origin's `stat`.
pub size: u64,
/// Modification time as reported by the origin's `stat`.
pub mtime: SystemTime,
}
/// The outcome of comparing an origin scan against the cached file metadata.
#[derive(Debug, Default)]
pub struct ChangeSet {
/// Files present in the scan that have no cached entry at the same path.
pub added: Vec<ScannedFile>,
/// Ids of cached files whose path was not seen in the scan.
pub removed: Vec<FileId>,
/// Ids of cached files whose size or mtime differs from the scan, paired
/// with the chunk-level diff between the old and new content.
pub modified: Vec<(FileId, ManifestDiff)>,
}
impl ChangeSet {
    /// Returns `true` when the set records no added, removed or modified files.
    pub fn is_empty(&self) -> bool {
        let Self {
            added,
            removed,
            modified,
        } = self;
        added.is_empty() && removed.is_empty() && modified.is_empty()
    }

    /// Total number of recorded changes across all three categories.
    pub fn total_changes(&self) -> usize {
        [self.added.len(), self.removed.len(), self.modified.len()]
            .iter()
            .sum()
    }
}
/// One chunk of a file's content as recorded in a manifest.
#[derive(Debug, Clone)]
pub struct ManifestChunk {
/// Hash identifying the chunk; chunks are matched between manifests by
/// this value alone.
pub hash: ChunkHash,
/// Offset of the chunk as reported by the chunker (presumably the byte
/// offset within the file — confirm against `CdcChunker`).
pub offset: u64,
/// Length of the chunk as reported by the chunker.
pub size: u32,
}
/// Chunk-level difference between an old and a new manifest of one file.
#[derive(Debug)]
pub struct ManifestDiff {
/// Chunks of the new manifest whose hash also occurs in the old manifest.
pub reuse: Vec<ManifestChunk>,
/// Chunks of the new manifest whose hash does not occur in the old manifest.
pub fetch: Vec<ManifestChunk>,
/// Hashes of old-manifest chunks that no longer occur in the new manifest.
pub orphaned: Vec<ChunkHash>,
}
/// Detects added, removed and modified files on an origin and computes
/// chunk-level diffs for the modified ones.
pub struct DeltaDetector {
/// Chunker used to split the content of modified files into chunks.
chunker: CdcChunker,
}
impl DeltaDetector {
    /// Creates a detector that uses the default [`CdcChunker`] configuration.
    pub fn new() -> Self {
        Self {
            chunker: CdcChunker::default(),
        }
    }

    /// Creates a detector that uses the supplied chunker.
    pub fn with_chunker(chunker: CdcChunker) -> Self {
        Self { chunker }
    }

    /// Scans `origin` and compares the result against the cached metadata.
    ///
    /// * Scanned files with no cached entry at the same path are reported as
    ///   added.
    /// * Cached files whose path is missing from the scan are reported as
    ///   removed.
    /// * Cached files whose size or mtime differs from the scan are re-read,
    ///   re-chunked and reported as modified together with a
    ///   [`ManifestDiff`]. When `manifests` has no entry for such a file the
    ///   old manifest is treated as empty, so every new chunk ends up in
    ///   `fetch` instead of the modification being silently dropped.
    ///
    /// NOTE(review): cached entries are matched by path only; the origin id
    /// stored in `real_path` is not consulted. If `cached` can contain files
    /// of several origins, the entries of other origins would be reported as
    /// removed — confirm that callers pass a per-origin view.
    ///
    /// # Errors
    ///
    /// Returns [`DeltaError::OriginScan`] when listing or stat-ing fails and
    /// [`DeltaError::OriginRead`] when a modified file cannot be read.
    pub async fn detect_changes(
        &self,
        origin: &dyn Origin,
        cached: &HashMap<FileId, FileMeta>,
        manifests: &HashMap<FileId, Vec<ManifestChunk>>,
    ) -> Result<ChangeSet, DeltaError> {
        let mut changes = ChangeSet::default();
        let origin_files = self.scan_origin(origin).await?;

        // Index the cache by borrowed path; no need to clone every PathBuf.
        let cached_by_path: HashMap<_, _> = cached
            .values()
            .map(|meta| (meta.real_path.path.as_path(), meta))
            .collect();

        for scanned in &origin_files {
            match cached_by_path.get(scanned.path.as_path()) {
                None => {
                    debug!("File added: {:?}", scanned.path);
                    changes.added.push(scanned.clone());
                }
                Some(cached_file) if self.is_modified_scan(cached_file, scanned) => {
                    debug!("File modified: {:?}", scanned.path);
                    // A missing manifest means nothing can be reused, not
                    // that the change may be ignored.
                    let old_chunks: &[ManifestChunk] = match manifests.get(&cached_file.id) {
                        Some(chunks) => chunks.as_slice(),
                        None => &[],
                    };
                    let new_chunks = self.compute_chunks_for_scan(origin, scanned).await?;
                    let diff = self.compute_diff(old_chunks, &new_chunks);
                    changes.modified.push((cached_file.id, diff));
                }
                Some(_) => {}
            }
        }

        let origin_paths: HashSet<_> = origin_files.iter().map(|f| &f.path).collect();
        for cached_file in cached.values() {
            if !origin_paths.contains(&cached_file.real_path.path) {
                debug!("File removed: {:?}", cached_file.real_path.path);
                changes.removed.push(cached_file.id);
            }
        }

        info!(
            "Delta detection complete: {} added, {} removed, {} modified",
            changes.added.len(),
            changes.removed.len(),
            changes.modified.len()
        );
        Ok(changes)
    }

    /// Returns `true` when the scanned size or mtime differs from the cached
    /// metadata. File content is not inspected.
    fn is_modified_scan(&self, cached: &FileMeta, scanned: &ScannedFile) -> bool {
        cached.size != scanned.size || cached.mtime != scanned.mtime
    }

    /// Walks the origin's directory tree starting at `/` and returns every
    /// audio file found (see [`Self::is_audio_file`]) with its size and mtime.
    ///
    /// The walk uses an explicit stack of pending directories instead of
    /// recursion; the order of the returned files is therefore unspecified.
    ///
    /// # Errors
    ///
    /// Any `readdir` or `stat` failure aborts the scan with
    /// [`DeltaError::OriginScan`].
    async fn scan_origin(&self, origin: &dyn Origin) -> Result<Vec<ScannedFile>, DeltaError> {
        let mut files = Vec::new();
        let mut dirs_to_scan = vec![PathBuf::from("/")];

        while let Some(dir) = dirs_to_scan.pop() {
            let entries = origin
                .readdir(&dir)
                .await
                .map_err(|e| DeltaError::OriginScan(e.to_string()))?;

            for entry in entries {
                let entry_path = dir.join(&entry.name);
                if entry.is_dir {
                    dirs_to_scan.push(entry_path);
                } else if Self::is_audio_file(&entry.name) {
                    let stat = origin
                        .stat(&entry_path)
                        .await
                        .map_err(|e| DeltaError::OriginScan(e.to_string()))?;
                    files.push(ScannedFile {
                        path: entry_path,
                        origin_id: origin.id().clone(),
                        size: stat.size,
                        mtime: stat.mtime,
                    });
                }
            }
        }
        Ok(files)
    }

    /// Returns `true` when `name` ends with one of the recognised audio
    /// extensions, compared case-insensitively.
    ///
    /// Only the suffix is inspected, so a name that consists of nothing but
    /// the extension (e.g. `.flac`) also matches.
    fn is_audio_file(name: &str) -> bool {
        const AUDIO_EXTENSIONS: [&str; 7] =
            [".flac", ".mp3", ".ogg", ".wav", ".m4a", ".aac", ".opus"];

        // Compare on bytes so no lowercased copy of the name is allocated for
        // every directory entry. All extensions are ASCII, so an ASCII
        // case-insensitive suffix comparison is sufficient.
        let bytes = name.as_bytes();
        AUDIO_EXTENSIONS.iter().any(|ext| {
            bytes.len() >= ext.len()
                && bytes[bytes.len() - ext.len()..].eq_ignore_ascii_case(ext.as_bytes())
        })
    }

    /// Reads the whole file from the origin and splits it into chunks with
    /// the detector's chunker.
    ///
    /// # Errors
    ///
    /// Returns [`DeltaError::OriginRead`] when the origin cannot deliver the
    /// file content.
    async fn compute_chunks_for_scan(
        &self,
        origin: &dyn Origin,
        scanned: &ScannedFile,
    ) -> Result<Vec<ManifestChunk>, DeltaError> {
        let data = origin
            .read_full(&scanned.path)
            .await
            .map_err(|e| DeltaError::OriginRead(e.to_string()))?;

        let manifest = self
            .chunker
            .chunk_refs(&data)
            .into_iter()
            .map(|c| ManifestChunk {
                hash: c.hash,
                offset: c.offset,
                size: c.length,
            })
            .collect();
        Ok(manifest)
    }

    /// Compares two manifests by chunk hash.
    ///
    /// New chunks whose hash occurs in `old_chunks` go to `reuse`, all other
    /// new chunks go to `fetch` (both keep the order of `new_chunks`). Hashes
    /// of old chunks that are absent from `new_chunks` are listed in
    /// `orphaned`. Offsets and sizes play no part in the comparison.
    fn compute_diff(&self, old_chunks: &[ManifestChunk], new_chunks: &[ManifestChunk]) -> ManifestDiff {
        let old_hashes: HashSet<_> = old_chunks.iter().map(|c| c.hash).collect();
        let new_hashes: HashSet<_> = new_chunks.iter().map(|c| c.hash).collect();

        // One pass over the new manifest splits it into reuse / fetch.
        let (reuse, fetch): (Vec<_>, Vec<_>) = new_chunks
            .iter()
            .cloned()
            .partition(|c| old_hashes.contains(&c.hash));

        let orphaned = old_chunks
            .iter()
            .filter(|c| !new_hashes.contains(&c.hash))
            .map(|c| c.hash)
            .collect();

        ManifestDiff {
            reuse,
            fetch,
            orphaned,
        }
    }
}
impl Default for DeltaDetector {
fn default() -> Self {
Self::new()
}
}
/// Errors produced while detecting changes on an origin.
///
/// Both variants carry the underlying origin error rendered as a string.
#[derive(Debug, thiserror::Error)]
pub enum DeltaError {
/// Reading a file's content from the origin failed.
#[error("Origin read error: {0}")]
OriginRead(String),
/// Listing a directory or stat-ing a file on the origin failed.
#[error("Origin scan error: {0}")]
OriginScan(String),
}
#[cfg(test)]
mod tests {
    use super::*;
    use musicfs_core::OriginId;
    use std::time::SystemTime;

    /// Builds cached metadata for `path` on the "test" origin with an epoch mtime.
    fn make_file_meta(id: i64, path: &str, size: u64) -> FileMeta {
        let real_path = RealPath {
            origin_id: OriginId::from("test"),
            path: PathBuf::from(path),
        };
        FileMeta {
            id: FileId(id),
            virtual_path: VirtualPath::new(format!("/test/{}", path)),
            real_path,
            size,
            mtime: SystemTime::UNIX_EPOCH,
            content_hash: None,
            audio: None,
        }
    }

    /// Builds a scan result for `path` on the "test" origin with an epoch mtime.
    fn make_scanned_file(path: &str, size: u64) -> ScannedFile {
        ScannedFile {
            path: PathBuf::from(path),
            origin_id: OriginId::from("test"),
            size,
            mtime: SystemTime::UNIX_EPOCH,
        }
    }

    /// Builds a 256-byte manifest chunk with the given hash at `offset`.
    fn chunk(hash: ChunkHash, offset: u64) -> ManifestChunk {
        ManifestChunk {
            hash,
            offset,
            size: 256,
        }
    }

    #[test]
    fn test_is_modified_size_change() {
        let cached = make_file_meta(1, "test.flac", 1000);
        let scanned = make_scanned_file("test.flac", 2000);

        let modified = DeltaDetector::new().is_modified_scan(&cached, &scanned);
        assert!(modified);
    }

    #[test]
    fn test_is_modified_same() {
        let cached = make_file_meta(1, "test.flac", 1000);
        let scanned = make_scanned_file("test.flac", 1000);

        let modified = DeltaDetector::new().is_modified_scan(&cached, &scanned);
        assert!(!modified);
    }

    #[test]
    fn test_is_audio_file() {
        // Recognised extensions, including an upper-case one.
        assert!(DeltaDetector::is_audio_file("track.flac"));
        assert!(DeltaDetector::is_audio_file("song.MP3"));
        assert!(DeltaDetector::is_audio_file("audio.ogg"));

        // Non-audio files are rejected.
        assert!(!DeltaDetector::is_audio_file("readme.txt"));
        assert!(!DeltaDetector::is_audio_file("cover.jpg"));
    }

    #[test]
    fn test_compute_diff() {
        // Old manifest A, B, C; the new manifest replaces B with D.
        let old_chunks = vec![
            chunk(ChunkHash::from_bytes(b"A"), 0),
            chunk(ChunkHash::from_bytes(b"B"), 256),
            chunk(ChunkHash::from_bytes(b"C"), 512),
        ];
        let new_chunks = vec![
            chunk(ChunkHash::from_bytes(b"A"), 0),
            chunk(ChunkHash::from_bytes(b"D"), 256),
            chunk(ChunkHash::from_bytes(b"C"), 512),
        ];

        let ManifestDiff {
            reuse,
            fetch,
            orphaned,
        } = DeltaDetector::new().compute_diff(&old_chunks, &new_chunks);

        assert_eq!(reuse.len(), 2);
        assert_eq!(fetch.len(), 1);
        assert_eq!(orphaned.len(), 1);
    }
}