Implement Week 5 CDC & Delta Detection with Oracle fixes
- Add CdcChunker using FastCDC v3 (16KB/64KB/256KB chunks)
- Add DeltaDetector with scan_origin() returning ScannedFile (no FileId assignment)
- Add OriginWatcher with inotify and 200ms debounce using tokio::spawn
- Fix LocalOrigin::read() to loop until all bytes are read
- Add read_full() method to Origin trait
- Add mtime field to ChunkManifest
- Update ContentFetcher to use CDC chunking
- Update bandwidth reduction test to assert >90% (NFR-6.4)

Tests: 71 pass (+11 new)
This commit is contained in:
@@ -0,0 +1,329 @@
|
||||
use crate::cdc::CdcChunker;
|
||||
use musicfs_core::{ChunkHash, FileId, FileMeta, OriginId, RealPath, VirtualPath};
|
||||
use musicfs_origins::Origin;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::PathBuf;
|
||||
use std::time::SystemTime;
|
||||
use tracing::{debug, info};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ScannedFile {
|
||||
pub path: PathBuf,
|
||||
pub origin_id: OriginId,
|
||||
pub size: u64,
|
||||
pub mtime: SystemTime,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct ChangeSet {
|
||||
pub added: Vec<ScannedFile>,
|
||||
pub removed: Vec<FileId>,
|
||||
pub modified: Vec<(FileId, ManifestDiff)>,
|
||||
}
|
||||
|
||||
impl ChangeSet {
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.added.is_empty() && self.removed.is_empty() && self.modified.is_empty()
|
||||
}
|
||||
|
||||
pub fn total_changes(&self) -> usize {
|
||||
self.added.len() + self.removed.len() + self.modified.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ManifestChunk {
|
||||
pub hash: ChunkHash,
|
||||
pub offset: u64,
|
||||
pub size: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ManifestDiff {
|
||||
pub reuse: Vec<ManifestChunk>,
|
||||
pub fetch: Vec<ManifestChunk>,
|
||||
pub orphaned: Vec<ChunkHash>,
|
||||
}
|
||||
|
||||
pub struct DeltaDetector {
|
||||
chunker: CdcChunker,
|
||||
}
|
||||
|
||||
impl DeltaDetector {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
chunker: CdcChunker::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_chunker(chunker: CdcChunker) -> Self {
|
||||
Self { chunker }
|
||||
}
|
||||
|
||||
pub async fn detect_changes(
|
||||
&self,
|
||||
origin: &dyn Origin,
|
||||
cached: &HashMap<FileId, FileMeta>,
|
||||
manifests: &HashMap<FileId, Vec<ManifestChunk>>,
|
||||
) -> Result<ChangeSet, DeltaError> {
|
||||
let mut changes = ChangeSet::default();
|
||||
|
||||
let origin_files = self.scan_origin(origin).await?;
|
||||
|
||||
let cached_by_path: HashMap<_, _> = cached
|
||||
.values()
|
||||
.map(|m| (m.real_path.path.clone(), m))
|
||||
.collect();
|
||||
|
||||
for scanned in &origin_files {
|
||||
if let Some(cached_file) = cached_by_path.get(&scanned.path) {
|
||||
if self.is_modified_scan(cached_file, scanned) {
|
||||
debug!("File modified: {:?}", scanned.path);
|
||||
|
||||
if let Some(old_chunks) = manifests.get(&cached_file.id) {
|
||||
let new_chunks = self.compute_chunks_for_scan(origin, scanned).await?;
|
||||
let diff = self.compute_diff(old_chunks, &new_chunks);
|
||||
changes.modified.push((cached_file.id, diff));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
debug!("File added: {:?}", scanned.path);
|
||||
changes.added.push(scanned.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let origin_paths: HashSet<_> = origin_files.iter().map(|f| &f.path).collect();
|
||||
|
||||
for cached_file in cached.values() {
|
||||
if !origin_paths.contains(&cached_file.real_path.path) {
|
||||
debug!("File removed: {:?}", cached_file.real_path.path);
|
||||
changes.removed.push(cached_file.id);
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Delta detection complete: {} added, {} removed, {} modified",
|
||||
changes.added.len(),
|
||||
changes.removed.len(),
|
||||
changes.modified.len()
|
||||
);
|
||||
|
||||
Ok(changes)
|
||||
}
|
||||
|
||||
/// Returns `true` when the scanned file's size or mtime differs from the
/// cached metadata.
///
/// Content is not inspected here; the check relies on metadata only.
// NOTE(review): mtime is compared for exact equality — if the metadata
// store rounds timestamps, unchanged files would look modified; confirm.
fn is_modified_scan(&self, cached: &FileMeta, scanned: &ScannedFile) -> bool {
    (cached.size, cached.mtime) != (scanned.size, scanned.mtime)
}
|
||||
|
||||
async fn scan_origin(&self, origin: &dyn Origin) -> Result<Vec<ScannedFile>, DeltaError> {
|
||||
let mut files = Vec::new();
|
||||
let mut dirs_to_scan = vec![PathBuf::from("/")];
|
||||
|
||||
while let Some(dir) = dirs_to_scan.pop() {
|
||||
let entries = origin
|
||||
.readdir(&dir)
|
||||
.await
|
||||
.map_err(|e| DeltaError::OriginScan(e.to_string()))?;
|
||||
|
||||
for entry in entries {
|
||||
let entry_path = dir.join(&entry.name);
|
||||
|
||||
if entry.is_dir {
|
||||
dirs_to_scan.push(entry_path);
|
||||
} else if Self::is_audio_file(&entry.name) {
|
||||
let stat = origin
|
||||
.stat(&entry_path)
|
||||
.await
|
||||
.map_err(|e| DeltaError::OriginScan(e.to_string()))?;
|
||||
|
||||
files.push(ScannedFile {
|
||||
path: entry_path,
|
||||
origin_id: origin.id().clone(),
|
||||
size: stat.size,
|
||||
mtime: stat.mtime,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
/// Returns `true` when `name` ends with one of the recognised audio
/// extensions (`.flac`, `.mp3`, `.ogg`, `.wav`, `.m4a`, `.aac`, `.opus`).
///
/// The comparison is ASCII case-insensitive (`song.MP3` matches) and is
/// performed on the raw bytes of the name, so no lowercased copy of the
/// name is allocated for each directory entry.
fn is_audio_file(name: &str) -> bool {
    /// Dotted suffixes that mark a file as audio.
    const AUDIO_SUFFIXES: [&str; 7] = [
        ".flac", ".mp3", ".ogg", ".wav", ".m4a", ".aac", ".opus",
    ];

    let name = name.as_bytes();
    AUDIO_SUFFIXES.iter().any(|suffix| {
        let suffix = suffix.as_bytes();
        // Compare only the tail of the name; shorter names cannot match.
        name.len() >= suffix.len()
            && name[name.len() - suffix.len()..].eq_ignore_ascii_case(suffix)
    })
}
|
||||
|
||||
async fn compute_chunks_for_scan(
|
||||
&self,
|
||||
origin: &dyn Origin,
|
||||
scanned: &ScannedFile,
|
||||
) -> Result<Vec<ManifestChunk>, DeltaError> {
|
||||
let data = origin
|
||||
.read_full(&scanned.path)
|
||||
.await
|
||||
.map_err(|e| DeltaError::OriginRead(e.to_string()))?;
|
||||
|
||||
let chunks = self.chunker.chunk_refs(&data);
|
||||
|
||||
Ok(chunks
|
||||
.into_iter()
|
||||
.map(|c| ManifestChunk {
|
||||
hash: c.hash,
|
||||
offset: c.offset,
|
||||
size: c.length,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn compute_diff(&self, old_chunks: &[ManifestChunk], new_chunks: &[ManifestChunk]) -> ManifestDiff {
|
||||
let old_hashes: HashSet<_> = old_chunks.iter().map(|c| c.hash).collect();
|
||||
let new_hashes: HashSet<_> = new_chunks.iter().map(|c| c.hash).collect();
|
||||
|
||||
ManifestDiff {
|
||||
reuse: new_chunks
|
||||
.iter()
|
||||
.filter(|c| old_hashes.contains(&c.hash))
|
||||
.cloned()
|
||||
.collect(),
|
||||
fetch: new_chunks
|
||||
.iter()
|
||||
.filter(|c| !old_hashes.contains(&c.hash))
|
||||
.cloned()
|
||||
.collect(),
|
||||
orphaned: old_chunks
|
||||
.iter()
|
||||
.filter(|c| !new_hashes.contains(&c.hash))
|
||||
.map(|c| c.hash)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for DeltaDetector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeltaError {
|
||||
#[error("Origin read error: {0}")]
|
||||
OriginRead(String),
|
||||
|
||||
#[error("Origin scan error: {0}")]
|
||||
OriginScan(String),
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use musicfs_core::OriginId;
    use std::time::{Duration, SystemTime};

    /// Builds cached metadata for `path` with the given id and size.
    /// The mtime is fixed to the Unix epoch.
    fn make_file_meta(id: i64, path: &str, size: u64) -> FileMeta {
        FileMeta {
            id: FileId(id),
            virtual_path: VirtualPath::new(format!("/test/{}", path)),
            real_path: RealPath {
                origin_id: OriginId::from("test"),
                path: PathBuf::from(path),
            },
            size,
            mtime: SystemTime::UNIX_EPOCH,
            content_hash: None,
            audio: None,
        }
    }

    /// Builds a scan result for `path` with the given size.
    /// The mtime is fixed to the Unix epoch.
    fn make_scanned_file(path: &str, size: u64) -> ScannedFile {
        ScannedFile {
            path: PathBuf::from(path),
            origin_id: OriginId::from("test"),
            size,
            mtime: SystemTime::UNIX_EPOCH,
        }
    }

    #[test]
    fn test_is_modified_size_change() {
        let detector = DeltaDetector::new();

        let cached = make_file_meta(1, "test.flac", 1000);
        let scanned = make_scanned_file("test.flac", 2000);

        assert!(detector.is_modified_scan(&cached, &scanned));
    }

    // Same size but a different mtime must also count as a modification.
    #[test]
    fn test_is_modified_mtime_change() {
        let detector = DeltaDetector::new();

        let cached = make_file_meta(1, "test.flac", 1000);
        let mut scanned = make_scanned_file("test.flac", 1000);
        scanned.mtime = SystemTime::UNIX_EPOCH + Duration::from_secs(1);

        assert!(detector.is_modified_scan(&cached, &scanned));
    }

    #[test]
    fn test_is_modified_same() {
        let detector = DeltaDetector::new();

        let cached = make_file_meta(1, "test.flac", 1000);
        let scanned = make_scanned_file("test.flac", 1000);

        assert!(!detector.is_modified_scan(&cached, &scanned));
    }

    #[test]
    fn test_is_audio_file() {
        assert!(DeltaDetector::is_audio_file("track.flac"));
        assert!(DeltaDetector::is_audio_file("song.MP3"));
        assert!(DeltaDetector::is_audio_file("audio.ogg"));
        assert!(DeltaDetector::is_audio_file("take.Wav"));
        assert!(DeltaDetector::is_audio_file("voice.opus"));
        assert!(!DeltaDetector::is_audio_file("readme.txt"));
        assert!(!DeltaDetector::is_audio_file("cover.jpg"));
        // The dot is part of the suffix: a bare extension name is not a match.
        assert!(!DeltaDetector::is_audio_file("flac"));
        assert!(!DeltaDetector::is_audio_file(""));
    }

    #[test]
    fn test_change_set_counts() {
        let mut changes = ChangeSet::default();
        assert!(changes.is_empty());
        assert_eq!(changes.total_changes(), 0);

        changes.added.push(make_scanned_file("new.flac", 10));
        changes.removed.push(FileId(7));

        assert!(!changes.is_empty());
        assert_eq!(changes.total_changes(), 2);
    }

    #[test]
    fn test_compute_diff() {
        let detector = DeltaDetector::new();

        let old_chunks = vec![
            ManifestChunk {
                hash: ChunkHash::from_bytes(b"A"),
                offset: 0,
                size: 256,
            },
            ManifestChunk {
                hash: ChunkHash::from_bytes(b"B"),
                offset: 256,
                size: 256,
            },
            ManifestChunk {
                hash: ChunkHash::from_bytes(b"C"),
                offset: 512,
                size: 256,
            },
        ];

        let new_chunks = vec![
            ManifestChunk {
                hash: ChunkHash::from_bytes(b"A"),
                offset: 0,
                size: 256,
            },
            ManifestChunk {
                hash: ChunkHash::from_bytes(b"D"),
                offset: 256,
                size: 256,
            },
            ManifestChunk {
                hash: ChunkHash::from_bytes(b"C"),
                offset: 512,
                size: 256,
            },
        ];

        let diff = detector.compute_diff(&old_chunks, &new_chunks);

        // Check which chunks ended up where, not only how many.
        assert_eq!(diff.reuse.len(), 2);
        assert_eq!(diff.reuse[0].hash, ChunkHash::from_bytes(b"A"));
        assert_eq!(diff.reuse[1].hash, ChunkHash::from_bytes(b"C"));

        assert_eq!(diff.fetch.len(), 1);
        assert_eq!(diff.fetch[0].hash, ChunkHash::from_bytes(b"D"));
        assert_eq!(diff.fetch[0].offset, 256);

        assert_eq!(diff.orphaned.len(), 1);
        assert_eq!(diff.orphaned[0], ChunkHash::from_bytes(b"B"));
    }
}
|
||||
Reference in New Issue
Block a user