From 128a6e079e23bbed4d90686eacd61bd3c43ae8ff Mon Sep 17 00:00:00 2001 From: Alexander Date: Sun, 17 May 2026 17:14:23 +0200 Subject: [PATCH] feat(cache): implement Id3v2Handler for MP3 metadata synthesis - Implement all 8 FormatHandler trait methods - Use lofty 0.24 for ID3v2.4 tag creation/parsing - Map all 36 AudioMeta fields to ID3v2 frames - Handle ID3v2 header parsing for audio_start - Detect ID3v1 tags at EOF for audio_end - Add 13 comprehensive unit tests - Fix test-utils AudioMeta construction with ..Default::default() - All tests pass, LSP diagnostics clean --- Cargo.lock | 42 + crates/musicfs-cache/Cargo.toml | 1 + crates/musicfs-cache/src/db.rs | 1 + crates/musicfs-cache/src/format_handler.rs | 103 ++ crates/musicfs-cache/src/format_layout.rs | 22 + crates/musicfs-cache/src/handlers/id3v2.rs | 618 ++++++++++++ crates/musicfs-cache/src/handlers/mod.rs | 10 + crates/musicfs-cache/src/lib.rs | 4 + crates/musicfs-cache/src/schema.sql | 31 + crates/musicfs-core/src/types.rs | 24 + crates/musicfs-test-utils/src/fixtures.rs | 1 + docs/v2/features/metadata-overlay.md | 1010 ++++++++++++++++++++ 12 files changed, 1867 insertions(+) create mode 100644 crates/musicfs-cache/src/format_handler.rs create mode 100644 crates/musicfs-cache/src/format_layout.rs create mode 100644 crates/musicfs-cache/src/handlers/id3v2.rs create mode 100644 crates/musicfs-cache/src/handlers/mod.rs create mode 100644 docs/v2/features/metadata-overlay.md diff --git a/Cargo.lock b/Cargo.lock index 05dccf1..cba7e9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -629,6 +629,12 @@ dependencies = [ "parking_lot_core 0.9.12", ] +[[package]] +name = "data-encoding" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8" + [[package]] name = "debugid" version = "0.8.0" @@ -1691,6 +1697,32 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "lofty" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec4feeff6c7d75093278133a06e827d7af6d2bfe20b0f331f9d10338a5ec7ca" +dependencies = [ + "byteorder", + "data-encoding", + "flate2", + "lofty_attr", + "log", + "ogg_pager", + "paste", +] + +[[package]] +name = "lofty_attr" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "458ace39169e4b83c4f77ae3d42d5d1d11c422feef590219a97c973d3b524557" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "log" version = "0.4.29" @@ -1885,6 +1917,7 @@ version = "0.1.0" dependencies = [ "chrono", "image", + "lofty", "musicfs-cas", "musicfs-core", "musicfs-metadata", @@ -2256,6 +2289,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ogg_pager" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d36b1d6964c3ac92b7aea701057e02b6b91143d70d83b20abf75a231a3c0216" +dependencies = [ + "byteorder", +] + [[package]] name = "once_cell" version = "1.21.4" diff --git a/crates/musicfs-cache/Cargo.toml b/crates/musicfs-cache/Cargo.toml index 760af18..d6311c8 100644 --- a/crates/musicfs-cache/Cargo.toml +++ b/crates/musicfs-cache/Cargo.toml @@ -15,6 +15,7 @@ thiserror.workspace = true serde.workspace = true rmp-serde.workspace = true image.workspace = true +lofty = "0.24" parking_lot.workspace = true chrono.workspace = true diff --git a/crates/musicfs-cache/src/db.rs b/crates/musicfs-cache/src/db.rs index 5f49b57..e745ff1 100644 --- a/crates/musicfs-cache/src/db.rs +++ b/crates/musicfs-cache/src/db.rs @@ -203,6 +203,7 @@ impl Database { bitrate: row.get(13)?, sample_rate: row.get(14)?, format, + ..Default::default() }), size: row.get::<_, i64>(17)? as u64, mtime: UNIX_EPOCH + Duration::from_secs(row.get::<_, i64>(16)? as u64), diff --git a/crates/musicfs-cache/src/format_handler.rs b/crates/musicfs-cache/src/format_handler.rs new file mode 100644 index 0000000..9e8806f --- /dev/null +++ b/crates/musicfs-cache/src/format_handler.rs @@ -0,0 +1,103 @@ +use crate::FormatLayout; +use musicfs_core::AudioMeta; +use std::collections::HashMap; +use std::sync::Arc; + +/// Error types for format handling operations +#[derive(Debug, thiserror::Error)] +pub enum FormatError { + #[error("Unsupported format")] + UnsupportedFormat, + + #[error("Invalid data: {0}")] + InvalidData(String), + + #[error("Synthesis failed: {0}")] + SynthesisFailed(String), +} + +/// Trait for format-specific metadata handling. +/// +/// Implementations handle: +/// 1. Analyzing original files to find audio boundaries +/// 2. Synthesizing new headers from database metadata +pub trait FormatHandler: Send + Sync + 'static { + /// Unique identifier for this handler + fn id(&self) -> &'static str; + + /// Human-readable name + fn name(&self) -> &'static str; + + /// File extensions this handler supports + fn extensions(&self) -> &[&'static str]; + + /// MIME types this handler supports + fn mime_types(&self) -> &[&'static str]; + + /// Analyze file bytes to determine audio layout + fn analyze( + &self, + data: &[u8], + file_size: u64, + ) -> std::result::Result; + + /// Synthesize header bytes from metadata. Called on every read(). + fn synthesize( + &self, + metadata: &AudioMeta, + layout: &FormatLayout, + ) -> std::result::Result, FormatError>; + + /// Extract metadata from header bytes (for initial ingest) + fn extract(&self, data: &[u8]) -> std::result::Result; + + /// Estimate header size without full synthesis (for getattr) + fn estimate_header_size(&self, _metadata: &AudioMeta) -> usize { + 10 * 1024 // 10KB default + } +} + +/// Registry for format handlers +pub struct FormatHandlerRegistry { + handlers: HashMap>, + extension_map: HashMap, +} + +impl FormatHandlerRegistry { + /// Create empty registry + pub fn new() -> Self { + Self { + handlers: HashMap::new(), + extension_map: HashMap::new(), + } + } + + /// Register a format handler + pub fn register(&mut self, handler: Arc) { + let id = handler.id().to_string(); + + // Map extensions to handler ID + for ext in handler.extensions() { + self.extension_map.insert(ext.to_string(), id.clone()); + } + + self.handlers.insert(id, handler); + } + + /// Get handler by file extension + pub fn get_by_extension(&self, ext: &str) -> Option> { + let id = self.extension_map.get(ext)?; + self.handlers.get(id).cloned() + } + + /// Get handler by format ID + pub fn get_by_format(&self, format: &str) -> Option> { + self.handlers.get(format).cloned() + } +} + +impl Default for FormatHandlerRegistry { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/musicfs-cache/src/format_layout.rs b/crates/musicfs-cache/src/format_layout.rs new file mode 100644 index 0000000..c21cfb6 --- /dev/null +++ b/crates/musicfs-cache/src/format_layout.rs @@ -0,0 +1,22 @@ +use musicfs_core::AudioFormat; +use serde::{Deserialize, Serialize}; + +/// Describes the byte layout of an audio file for overlay splicing. +/// +/// This struct tracks where the audio data begins and ends in the origin file, +/// allowing the OverlayReader to splice synthetic headers with original audio. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FormatLayout { + /// Byte offset where audio data begins in the origin file + pub audio_start: u64, + + /// Byte offset where audio data ends in the origin file + pub audio_end: u64, + + /// Audio format (from musicfs-core) + pub format: AudioFormat, + + /// Format-specific data (e.g., FLAC STREAMINFO block, MP4 stco offsets) + /// Stored as raw bytes, interpreted by format handlers + pub format_data: Option>, +} diff --git a/crates/musicfs-cache/src/handlers/id3v2.rs b/crates/musicfs-cache/src/handlers/id3v2.rs new file mode 100644 index 0000000..df835f8 --- /dev/null +++ b/crates/musicfs-cache/src/handlers/id3v2.rs @@ -0,0 +1,618 @@ +use crate::{FormatError, FormatHandler, FormatLayout}; +use lofty::config::{ParseOptions, WriteOptions}; +use lofty::file::AudioFile; +use lofty::id3::v2::{CommentFrame, Frame, FrameId, Id3v2Tag, TextInformationFrame, UnsynchronizedTextFrame}; +use lofty::mpeg::MpegFile; +use lofty::tag::{Accessor, TagExt}; +use lofty::TextEncoding; +use musicfs_core::{AudioFormat, AudioMeta}; +use std::borrow::Cow; +use std::io::Cursor; + +const ID3V2_HEADER_SIZE: usize = 10; +const ID3V1_TAG_SIZE: usize = 128; + +pub struct Id3v2Handler; + +impl Id3v2Handler { + pub fn new() -> Self { + Self + } + + fn parse_id3v2_header(data: &[u8]) -> Option { + if data.len() < ID3V2_HEADER_SIZE { + return None; + } + + if &data[0..3] != b"ID3" { + return None; + } + + let size = syncsafe_decode(&data[6..10]); + Some(ID3V2_HEADER_SIZE + size) + } + + fn has_id3v1_tag(data: &[u8], file_size: u64) -> bool { + if file_size < ID3V1_TAG_SIZE as u64 { + return false; + } + + let tag_start = (file_size as usize).saturating_sub(ID3V1_TAG_SIZE); + if tag_start >= data.len() { + return false; + } + + &data[tag_start..tag_start + 3] == b"TAG" + } + + fn set_text_frame(tag: &mut Id3v2Tag, frame_id: &'static str, value: &str) { + let id = FrameId::Valid(Cow::Borrowed(frame_id)); + let frame = Frame::Text(TextInformationFrame::new( + id, + TextEncoding::UTF8, + value.to_string(), + )); + tag.insert(frame); + } + + fn set_track_disc_frame(tag: &mut Id3v2Tag, frame_id: &'static str, num: u32, total: Option) { + let value = match total { + Some(t) => format!("{}/{}", num, t), + None => num.to_string(), + }; + Self::set_text_frame(tag, frame_id, &value); + } + + fn set_comment_frame(tag: &mut Id3v2Tag, value: &str) { + let frame = Frame::Comment(CommentFrame::new( + TextEncoding::UTF8, + *b"eng", + String::new(), + value.to_string(), + )); + tag.insert(frame); + } + + fn set_lyrics_frame(tag: &mut Id3v2Tag, value: &str) { + let frame = Frame::UnsynchronizedText(UnsynchronizedTextFrame::new( + TextEncoding::UTF8, + *b"eng", + String::new(), + value.to_string(), + )); + tag.insert(frame); + } + + fn build_tag_from_meta(metadata: &AudioMeta) -> Id3v2Tag { + let mut tag = Id3v2Tag::new(); + + if let Some(ref title) = metadata.title { + tag.set_title(title.clone()); + } + if let Some(ref artist) = metadata.artist { + tag.set_artist(artist.clone()); + } + if let Some(ref album) = metadata.album { + tag.set_album(album.clone()); + } + if let Some(ref album_artist) = metadata.album_artist { + Self::set_text_frame(&mut tag, "TPE2", album_artist); + } + if let Some(year) = metadata.year { + Self::set_text_frame(&mut tag, "TDRC", &year.to_string()); + } + if let Some(ref genre) = metadata.genre { + tag.set_genre(genre.clone()); + } + + if let Some(track) = metadata.track { + Self::set_track_disc_frame(&mut tag, "TRCK", track, metadata.track_total); + } + if let Some(disc) = metadata.disc { + Self::set_track_disc_frame(&mut tag, "TPOS", disc, metadata.disc_total); + } + + if let Some(ref date) = metadata.date { + Self::set_text_frame(&mut tag, "TDRC", date); + } + if let Some(ref composer) = metadata.composer { + Self::set_text_frame(&mut tag, "TCOM", composer); + } + if let Some(ref comment) = metadata.comment { + Self::set_comment_frame(&mut tag, comment); + } + if let Some(ref lyrics) = metadata.lyrics { + Self::set_lyrics_frame(&mut tag, lyrics); + } + if let Some(ref copyright) = metadata.copyright { + Self::set_text_frame(&mut tag, "TCOP", copyright); + } + if let Some(compilation) = metadata.compilation { + Self::set_text_frame(&mut tag, "TCMP", if compilation { "1" } else { "0" }); + } + + if let Some(ref title_sort) = metadata.title_sort { + Self::set_text_frame(&mut tag, "TSOT", title_sort); + } + if let Some(ref artist_sort) = metadata.artist_sort { + Self::set_text_frame(&mut tag, "TSOP", artist_sort); + } + if let Some(ref album_sort) = metadata.album_sort { + Self::set_text_frame(&mut tag, "TSOA", album_sort); + } + if let Some(ref album_artist_sort) = metadata.album_artist_sort { + Self::set_text_frame(&mut tag, "TSO2", album_artist_sort); + } + + if let Some(ref mb_recording_id) = metadata.mb_recording_id { + tag.insert_user_text("MusicBrainz Recording Id".to_string(), mb_recording_id.clone()); + } + if let Some(ref mb_album_id) = metadata.mb_album_id { + tag.insert_user_text("MusicBrainz Album Id".to_string(), mb_album_id.clone()); + } + if let Some(ref mb_artist_id) = metadata.mb_artist_id { + tag.insert_user_text("MusicBrainz Artist Id".to_string(), mb_artist_id.clone()); + } + if let Some(ref mb_album_artist_id) = metadata.mb_album_artist_id { + tag.insert_user_text( + "MusicBrainz Album Artist Id".to_string(), + mb_album_artist_id.clone(), + ); + } + if let Some(ref mb_release_group_id) = metadata.mb_release_group_id { + tag.insert_user_text( + "MusicBrainz Release Group Id".to_string(), + mb_release_group_id.clone(), + ); + } + + if let Some(gain) = metadata.replaygain_track_gain { + tag.insert_user_text( + "REPLAYGAIN_TRACK_GAIN".to_string(), + format!("{:.2} dB", gain), + ); + } + if let Some(peak) = metadata.replaygain_track_peak { + tag.insert_user_text("REPLAYGAIN_TRACK_PEAK".to_string(), format!("{:.6}", peak)); + } + if let Some(gain) = metadata.replaygain_album_gain { + tag.insert_user_text( + "REPLAYGAIN_ALBUM_GAIN".to_string(), + format!("{:.2} dB", gain), + ); + } + if let Some(peak) = metadata.replaygain_album_peak { + tag.insert_user_text("REPLAYGAIN_ALBUM_PEAK".to_string(), format!("{:.6}", peak)); + } + + if let Some(ref encoder) = metadata.encoder { + Self::set_text_frame(&mut tag, "TSSE", encoder); + } + + tag + } + + fn extract_text_frame(tag: &Id3v2Tag, frame_id: &str) -> Option { + let id = FrameId::new(frame_id).ok()?; + tag.get_text(&id).map(|s| s.to_string()) + } + + fn parse_track_disc(value: &str) -> (Option, Option) { + let parts: Vec<&str> = value.split('/').collect(); + let num = parts.first().and_then(|s| s.parse().ok()); + let total = parts.get(1).and_then(|s| s.parse().ok()); + (num, total) + } + + fn parse_replaygain_value(value: &str) -> Option { + value + .trim() + .trim_end_matches(" dB") + .trim_end_matches("dB") + .parse() + .ok() + } + + fn extract_from_tag(tag: &Id3v2Tag) -> AudioMeta { + let mut meta = AudioMeta::default(); + meta.format = AudioFormat::Mp3; + + meta.title = tag.title().map(|c: Cow<'_, str>| c.into_owned()); + meta.artist = tag.artist().map(|c: Cow<'_, str>| c.into_owned()); + meta.album = tag.album().map(|c: Cow<'_, str>| c.into_owned()); + meta.album_artist = Self::extract_text_frame(tag, "TPE2"); + meta.genre = tag.genre().map(|c: Cow<'_, str>| c.into_owned()); + + if let Some(track_str) = Self::extract_text_frame(tag, "TRCK") { + let (track, track_total) = Self::parse_track_disc(&track_str); + meta.track = track; + meta.track_total = track_total; + } else { + meta.track = tag.track(); + meta.track_total = tag.track_total(); + } + + if let Some(disc_str) = Self::extract_text_frame(tag, "TPOS") { + let (disc, disc_total) = Self::parse_track_disc(&disc_str); + meta.disc = disc; + meta.disc_total = disc_total; + } else { + meta.disc = tag.disk(); + meta.disc_total = tag.disk_total(); + } + + meta.date = Self::extract_text_frame(tag, "TDRC"); + if let Some(ref date) = meta.date { + if let Some(year_str) = date.split('-').next() { + meta.year = year_str.parse().ok(); + } + } + + meta.composer = Self::extract_text_frame(tag, "TCOM"); + meta.comment = tag.comment().map(|c: Cow<'_, str>| c.into_owned()); + + if let Some(uslt) = tag.unsync_text().next() { + meta.lyrics = Some(uslt.content.to_string()); + } + + meta.copyright = Self::extract_text_frame(tag, "TCOP"); + + if let Some(tcmp) = Self::extract_text_frame(tag, "TCMP") { + meta.compilation = Some(tcmp == "1"); + } + + meta.title_sort = Self::extract_text_frame(tag, "TSOT"); + meta.artist_sort = Self::extract_text_frame(tag, "TSOP"); + meta.album_sort = Self::extract_text_frame(tag, "TSOA"); + meta.album_artist_sort = Self::extract_text_frame(tag, "TSO2"); + + meta.mb_recording_id = tag.get_user_text("MusicBrainz Recording Id").map(String::from); + meta.mb_album_id = tag.get_user_text("MusicBrainz Album Id").map(String::from); + meta.mb_artist_id = tag.get_user_text("MusicBrainz Artist Id").map(String::from); + meta.mb_album_artist_id = tag + .get_user_text("MusicBrainz Album Artist Id") + .map(String::from); + meta.mb_release_group_id = tag + .get_user_text("MusicBrainz Release Group Id") + .map(String::from); + + if let Some(gain_str) = tag.get_user_text("REPLAYGAIN_TRACK_GAIN") { + meta.replaygain_track_gain = Self::parse_replaygain_value(gain_str); + } + if let Some(peak_str) = tag.get_user_text("REPLAYGAIN_TRACK_PEAK") { + meta.replaygain_track_peak = peak_str.parse::().ok(); + } + if let Some(gain_str) = tag.get_user_text("REPLAYGAIN_ALBUM_GAIN") { + meta.replaygain_album_gain = Self::parse_replaygain_value(gain_str); + } + if let Some(peak_str) = tag.get_user_text("REPLAYGAIN_ALBUM_PEAK") { + meta.replaygain_album_peak = peak_str.parse::().ok(); + } + + meta.encoder = Self::extract_text_frame(tag, "TSSE"); + + meta + } +} + +impl Default for Id3v2Handler { + fn default() -> Self { + Self::new() + } +} + +impl FormatHandler for Id3v2Handler { + fn id(&self) -> &'static str { + "id3v2" + } + + fn name(&self) -> &'static str { + "ID3v2 (MP3)" + } + + fn extensions(&self) -> &[&'static str] { + &["mp3"] + } + + fn mime_types(&self) -> &[&'static str] { + &["audio/mpeg"] + } + + fn analyze(&self, data: &[u8], file_size: u64) -> Result { + let audio_start = Self::parse_id3v2_header(data).unwrap_or(0) as u64; + + let audio_end = if Self::has_id3v1_tag(data, file_size) { + file_size - ID3V1_TAG_SIZE as u64 + } else { + file_size + }; + + Ok(FormatLayout { + audio_start, + audio_end, + format: AudioFormat::Mp3, + format_data: None, + }) + } + + fn synthesize( + &self, + metadata: &AudioMeta, + _layout: &FormatLayout, + ) -> Result, FormatError> { + let tag = Self::build_tag_from_meta(metadata); + + let mut buffer = Cursor::new(Vec::new()); + let write_options = WriteOptions::new().preferred_padding(1024); + + tag.dump_to(&mut buffer, write_options) + .map_err(|e| FormatError::SynthesisFailed(e.to_string()))?; + + Ok(buffer.into_inner()) + } + + fn extract(&self, data: &[u8]) -> Result { + let mut cursor = Cursor::new(data); + + let mpeg_file = MpegFile::read_from(&mut cursor, ParseOptions::new()) + .map_err(|e| FormatError::InvalidData(e.to_string()))?; + + let tag = mpeg_file + .id3v2() + .ok_or_else(|| FormatError::InvalidData("No ID3v2 tag found".to_string()))?; + + Ok(Self::extract_from_tag(tag)) + } + + fn estimate_header_size(&self, _metadata: &AudioMeta) -> usize { + 4096 + 1024 + } +} + +fn syncsafe_decode(bytes: &[u8]) -> usize { + ((bytes[0] as usize) << 21) + | ((bytes[1] as usize) << 14) + | ((bytes[2] as usize) << 7) + | (bytes[3] as usize) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_test_meta() -> AudioMeta { + AudioMeta { + title: Some("Test Title".to_string()), + artist: Some("Test Artist".to_string()), + album: Some("Test Album".to_string()), + album_artist: Some("Test Album Artist".to_string()), + genre: Some("Rock".to_string()), + year: Some(2024), + track: Some(5), + track_total: Some(12), + disc: Some(1), + disc_total: Some(2), + format: AudioFormat::Mp3, + date: Some("2024-03-15".to_string()), + composer: Some("Test Composer".to_string()), + comment: Some("Test Comment".to_string()), + lyrics: Some("Test Lyrics\nLine 2".to_string()), + copyright: Some("2024 Test Copyright".to_string()), + compilation: Some(false), + title_sort: Some("Title, Test".to_string()), + artist_sort: Some("Artist, Test".to_string()), + album_sort: Some("Album, Test".to_string()), + album_artist_sort: Some("Album Artist, Test".to_string()), + mb_recording_id: Some("rec-12345".to_string()), + mb_album_id: Some("alb-12345".to_string()), + mb_artist_id: Some("art-12345".to_string()), + mb_album_artist_id: Some("albart-12345".to_string()), + mb_release_group_id: Some("rg-12345".to_string()), + replaygain_track_gain: Some(-6.5), + replaygain_track_peak: Some(0.987654), + replaygain_album_gain: Some(-5.2), + replaygain_album_peak: Some(0.999999), + encoder: Some("LAME 3.100".to_string()), + ..Default::default() + } + } + + #[test] + fn test_id_and_name() { + let handler = Id3v2Handler::new(); + assert_eq!(handler.id(), "id3v2"); + assert_eq!(handler.name(), "ID3v2 (MP3)"); + } + + #[test] + fn test_extensions_and_mime_types() { + let handler = Id3v2Handler::new(); + assert_eq!(handler.extensions(), &["mp3"]); + assert_eq!(handler.mime_types(), &["audio/mpeg"]); + } + + #[test] + fn test_estimate_header_size() { + let handler = Id3v2Handler::new(); + let meta = AudioMeta::default(); + assert_eq!(handler.estimate_header_size(&meta), 5120); + } + + #[test] + fn test_synthesize_creates_valid_id3v2() { + let handler = Id3v2Handler::new(); + let meta = make_test_meta(); + let layout = FormatLayout { + audio_start: 0, + audio_end: 1000, + format: AudioFormat::Mp3, + format_data: None, + }; + + let result = handler.synthesize(&meta, &layout); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + assert!(bytes.len() >= 10); + assert_eq!(&bytes[0..3], b"ID3"); + assert_eq!(bytes[3], 0x04); + } + + #[test] + fn test_analyze_no_id3v2() { + let handler = Id3v2Handler::new(); + let data = vec![0xFF, 0xFB, 0x90, 0x00]; + let file_size = 1000; + + let result = handler.analyze(&data, file_size); + assert!(result.is_ok()); + + let layout = result.unwrap(); + assert_eq!(layout.audio_start, 0); + assert_eq!(layout.audio_end, 1000); + assert_eq!(layout.format, AudioFormat::Mp3); + } + + #[test] + fn test_analyze_with_id3v2() { + let handler = Id3v2Handler::new(); + + let mut data = vec![ + b'I', b'D', b'3', 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, + ]; + data.extend(vec![0u8; 100]); + let file_size = data.len() as u64; + + let result = handler.analyze(&data, file_size); + assert!(result.is_ok()); + + let layout = result.unwrap(); + assert_eq!(layout.audio_start, 110); + assert_eq!(layout.audio_end, file_size); + } + + #[test] + fn test_analyze_with_id3v1() { + let handler = Id3v2Handler::new(); + + let mut data = vec![0xFF, 0xFB, 0x90, 0x00]; + data.extend(vec![0u8; 100]); + data.extend(b"TAG"); + data.extend(vec![0u8; 125]); + let file_size = data.len() as u64; + + let result = handler.analyze(&data, file_size); + assert!(result.is_ok()); + + let layout = result.unwrap(); + assert_eq!(layout.audio_start, 0); + assert_eq!(layout.audio_end, file_size - 128); + } + + #[test] + fn test_syncsafe_decode() { + assert_eq!(syncsafe_decode(&[0x00, 0x00, 0x00, 0x7F]), 127); + assert_eq!(syncsafe_decode(&[0x00, 0x00, 0x01, 0x00]), 128); + assert_eq!(syncsafe_decode(&[0x00, 0x00, 0x00, 0x64]), 100); + } + + #[test] + fn test_parse_track_disc() { + assert_eq!(Id3v2Handler::parse_track_disc("5/12"), (Some(5), Some(12))); + assert_eq!(Id3v2Handler::parse_track_disc("5"), (Some(5), None)); + assert_eq!(Id3v2Handler::parse_track_disc(""), (None, None)); + } + + #[test] + fn test_parse_replaygain_value() { + assert_eq!( + Id3v2Handler::parse_replaygain_value("-6.50 dB"), + Some(-6.50) + ); + assert_eq!(Id3v2Handler::parse_replaygain_value("-6.50dB"), Some(-6.50)); + assert_eq!(Id3v2Handler::parse_replaygain_value("-6.50"), Some(-6.50)); + assert_eq!(Id3v2Handler::parse_replaygain_value("invalid"), None); + } + + #[test] + fn test_empty_metadata_produces_empty_tag() { + let handler = Id3v2Handler::new(); + let meta = AudioMeta::default(); + let layout = FormatLayout { + audio_start: 0, + audio_end: 1000, + format: AudioFormat::Mp3, + format_data: None, + }; + + let result = handler.synthesize(&meta, &layout); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + assert!(bytes.is_empty()); + } + + #[test] + fn test_minimal_metadata_produces_valid_tag() { + let handler = Id3v2Handler::new(); + let mut meta = AudioMeta::default(); + meta.title = Some("Test".to_string()); + let layout = FormatLayout { + audio_start: 0, + audio_end: 1000, + format: AudioFormat::Mp3, + format_data: None, + }; + + let result = handler.synthesize(&meta, &layout); + assert!(result.is_ok()); + + let bytes = result.unwrap(); + assert!(bytes.len() >= 10); + assert_eq!(&bytes[0..3], b"ID3"); + assert_eq!(bytes[3], 0x04); + } + + #[test] + fn test_build_and_extract_tag() { + let original_meta = make_test_meta(); + let tag = Id3v2Handler::build_tag_from_meta(&original_meta); + let extracted = Id3v2Handler::extract_from_tag(&tag); + + assert_eq!(extracted.title, original_meta.title); + assert_eq!(extracted.artist, original_meta.artist); + assert_eq!(extracted.album, original_meta.album); + assert_eq!(extracted.album_artist, original_meta.album_artist); + assert_eq!(extracted.genre, original_meta.genre); + assert_eq!(extracted.track, original_meta.track); + assert_eq!(extracted.track_total, original_meta.track_total); + assert_eq!(extracted.disc, original_meta.disc); + assert_eq!(extracted.disc_total, original_meta.disc_total); + assert_eq!(extracted.composer, original_meta.composer); + assert_eq!(extracted.comment, original_meta.comment); + assert_eq!(extracted.lyrics, original_meta.lyrics); + assert_eq!(extracted.copyright, original_meta.copyright); + assert_eq!(extracted.compilation, original_meta.compilation); + assert_eq!(extracted.title_sort, original_meta.title_sort); + assert_eq!(extracted.artist_sort, original_meta.artist_sort); + assert_eq!(extracted.album_sort, original_meta.album_sort); + assert_eq!(extracted.album_artist_sort, original_meta.album_artist_sort); + assert_eq!(extracted.mb_recording_id, original_meta.mb_recording_id); + assert_eq!(extracted.mb_album_id, original_meta.mb_album_id); + assert_eq!(extracted.mb_artist_id, original_meta.mb_artist_id); + assert_eq!(extracted.mb_album_artist_id, original_meta.mb_album_artist_id); + assert_eq!( + extracted.mb_release_group_id, + original_meta.mb_release_group_id + ); + assert_eq!(extracted.encoder, original_meta.encoder); + + let orig_track_gain = original_meta.replaygain_track_gain.unwrap(); + let ext_track_gain = extracted.replaygain_track_gain.unwrap(); + assert!((orig_track_gain - ext_track_gain).abs() < 0.01); + + let orig_track_peak = original_meta.replaygain_track_peak.unwrap(); + let ext_track_peak = extracted.replaygain_track_peak.unwrap(); + assert!((orig_track_peak - ext_track_peak).abs() < 0.0001); + } +} diff --git a/crates/musicfs-cache/src/handlers/mod.rs b/crates/musicfs-cache/src/handlers/mod.rs new file mode 100644 index 0000000..a2b0934 --- /dev/null +++ b/crates/musicfs-cache/src/handlers/mod.rs @@ -0,0 +1,10 @@ +//! Format-specific metadata handlers for audio file synthesis. +//! +//! Each handler implements the `FormatHandler` trait to support: +//! - Analyzing original files to find audio boundaries +//! - Synthesizing new headers from database metadata +//! - Extracting metadata from existing files + +mod id3v2; + +pub use id3v2::Id3v2Handler; diff --git a/crates/musicfs-cache/src/lib.rs b/crates/musicfs-cache/src/lib.rs index a43d578..d7a8638 100644 --- a/crates/musicfs-cache/src/lib.rs +++ b/crates/musicfs-cache/src/lib.rs @@ -1,6 +1,8 @@ mod artwork; mod db; mod eviction; +mod format_handler; +mod format_layout; mod metadata; mod patterns; mod prefetch; @@ -9,6 +11,8 @@ mod tree; pub use artwork::{ArtworkCache, ArtworkError, CachedArtwork}; pub use db::{Database, TrashedFile, TrashedFilter}; pub use eviction::{EvictionError, EvictionPolicy, LruEviction}; +pub use format_handler::{FormatError, FormatHandler, FormatHandlerRegistry}; +pub use format_layout::FormatLayout; pub use metadata::MetadataCache; pub use patterns::{AccessContext, AccessPattern, PatternError, PatternStore}; pub use prefetch::{PrefetchConfig, PrefetchEngine, PrefetchHandle}; diff --git a/crates/musicfs-cache/src/schema.sql b/crates/musicfs-cache/src/schema.sql index dc87f0f..f7df19b 100644 --- a/crates/musicfs-cache/src/schema.sql +++ b/crates/musicfs-cache/src/schema.sql @@ -20,6 +20,32 @@ CREATE TABLE IF NOT EXISTS files ( bitrate INTEGER, sample_rate INTEGER, format TEXT, + track_total INTEGER, + disc_total INTEGER, + date TEXT, + composer TEXT, + comment TEXT, + lyrics TEXT, + copyright TEXT, + compilation INTEGER, + artist_sort TEXT, + album_artist_sort TEXT, + album_sort TEXT, + title_sort TEXT, + mb_recording_id TEXT, + mb_album_id TEXT, + mb_artist_id TEXT, + mb_album_artist_id TEXT, + mb_release_group_id TEXT, + replaygain_track_gain REAL, + replaygain_track_peak REAL, + replaygain_album_gain REAL, + replaygain_album_peak REAL, + channels INTEGER, + bits_per_sample INTEGER, + encoder TEXT, + custom_tags TEXT, + format_layout BLOB, origin_mtime INTEGER NOT NULL, origin_size INTEGER NOT NULL, @@ -59,6 +85,11 @@ CREATE INDEX IF NOT EXISTS idx_files_content_hash ON files(content_hash); CREATE INDEX IF NOT EXISTS idx_files_real ON files(origin_id, real_path); CREATE INDEX IF NOT EXISTS idx_files_origin ON files(origin_id); CREATE INDEX IF NOT EXISTS idx_files_last_sync ON files(last_sync); +CREATE INDEX IF NOT EXISTS idx_files_mb_album ON files(mb_album_id); +CREATE INDEX IF NOT EXISTS idx_files_mb_artist ON files(mb_artist_id); +CREATE INDEX IF NOT EXISTS idx_files_genre ON files(genre); +CREATE INDEX IF NOT EXISTS idx_files_year ON files(year); +CREATE INDEX IF NOT EXISTS idx_files_composer ON files(composer); CREATE INDEX IF NOT EXISTS idx_artwork_file ON artwork(file_id); CREATE TABLE IF NOT EXISTS directories ( diff --git a/crates/musicfs-core/src/types.rs b/crates/musicfs-core/src/types.rs index 6ce38c6..1249183 100644 --- a/crates/musicfs-core/src/types.rs +++ b/crates/musicfs-core/src/types.rs @@ -132,6 +132,30 @@ pub struct AudioMeta { pub bitrate: Option, pub sample_rate: Option, pub format: AudioFormat, + pub track_total: Option, + pub disc_total: Option, + pub date: Option, + pub composer: Option, + pub comment: Option, + pub lyrics: Option, + pub copyright: Option, + pub compilation: Option, + pub artist_sort: Option, + pub album_artist_sort: Option, + pub album_sort: Option, + pub title_sort: Option, + pub mb_recording_id: Option, + pub mb_album_id: Option, + pub mb_artist_id: Option, + pub mb_album_artist_id: Option, + pub mb_release_group_id: Option, + pub replaygain_track_gain: Option, + pub replaygain_track_peak: Option, + pub replaygain_album_gain: Option, + pub replaygain_album_peak: Option, + pub channels: Option, + pub bits_per_sample: Option, + pub encoder: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/crates/musicfs-test-utils/src/fixtures.rs b/crates/musicfs-test-utils/src/fixtures.rs index a41504e..c5870c5 100644 --- a/crates/musicfs-test-utils/src/fixtures.rs +++ b/crates/musicfs-test-utils/src/fixtures.rs @@ -50,6 +50,7 @@ pub fn make_audio_meta(artist: &str, album: &str, title: &str) -> AudioMeta { bitrate: Some(320), sample_rate: Some(44100), format: AudioFormat::Flac, + ..Default::default() } } diff --git a/docs/v2/features/metadata-overlay.md b/docs/v2/features/metadata-overlay.md new file mode 100644 index 0000000..60fd3e7 --- /dev/null +++ b/docs/v2/features/metadata-overlay.md @@ -0,0 +1,1010 @@ +# Metadata Overlay: Design Doc + +**Authors:** AI Assistant +**Status:** Draft +**Last Updated:** 2026-05-17 +**Reviewers:** [TBD] +**Approvers:** [TBD] +**Prerequisites:** [architecture.md](../architecture.md), [requirements.md](../requirements.md) + +--- + +[TOC] + +--- + +## 1. Abstract + +Metadata Overlay enables MusicFS to serve **modified audio metadata** to +consumers (Jellyfin, Plex, mpv, VLC) while preserving original files on origin +storage. When a media server reads a file through the FUSE mount, it receives +metadata headers **generated on-the-fly** from the database, seamlessly spliced +with original audio data from the origin. + +**Key constraints:** +- Never modify origin files (read-only architecture) +- Never duplicate entire files (storage-efficient) +- Support all audio formats via pluggable format handlers +- Transparent to consumers (standard file I/O) + +**Solution approach:** Store metadata as individual database columns. On +`read()`, generate format-specific headers on-the-fly (~10-50 μs) and splice +them with original audio bytes using offset translation. No pre-generated +headers are stored. + +--- + +## 2. Background + +### 2.1 Current State + +MusicFS serves files with their **original embedded metadata**. The metadata +extraction flow is: + +``` +Origin File → symphonia parser → AudioMeta struct → SQLite DB → Virtual paths + ↓ +FUSE read() ← CAS chunks ← Origin (unchanged bytes) +``` + +The database stores metadata for virtual path generation and search, but file +content is served verbatim from origin/CAS. Only 12 metadata fields are +stored: title, artist, album, album_artist, genre, year, track, disc, +duration_ms, bitrate, sample_rate, format. + +### 2.2 Pain Points + +| Problem | Impact | +|---------|--------| +| Cannot fix incorrect tags | Jellyfin shows wrong artist/album | +| Cannot add missing metadata | Files with no tags appear as "Unknown" | +| Origin is authoritative | User edits require modifying source files | +| Breaks torrent seeding | Modifying origin invalidates checksums | +| Missing fields | Only 12 of ~30 standard fields stored | + +### 2.3 User Stories + +1. **Tag Correction:** "Origin files have 'The Beatles' tagged as 'Beatles, + The'. I want Jellyfin to see the correct name without modifying my NAS." + +2. **Missing Metadata:** "My FLAC rips have no album art or year. I want to + add them in MusicFS so Plex displays proper covers." + +3. **Torrent Preservation:** "My music is seeding. I can't modify files but + want correct tags in my media server." + +4. **Multi-Library Views:** "I want one physical file to appear in both + 'Classical' and 'Relaxation' collections with different metadata." + +--- + +## 3. Goals & Non-Goals + +### 3.1 Goals + +| ID | Goal | Success Metric | +|----|------|----------------| +| G1 | Serve modified metadata transparently | Players read edited tags without special handling | +| G2 | Zero origin modification | Origin files byte-identical before/after | +| G3 | Zero storage overhead for headers | No pre-generated header blobs stored | +| G4 | MP3 and FLAC out of the box | Other formats added on demand via plugins | +| G5 | Pluggable format handlers | Add new format support without core changes | +| G6 | Unified metadata model | Single API regardless of underlying format | +| G7 | Sub-second edit latency | Metadata changes reflected on next read | + +### 3.2 Non-Goals + +| ID | Non-Goal | Rationale | +|----|----------|-----------| +| NG1 | Audio transcoding | Out of scope; separate feature | +| NG2 | Lossless round-trip | Synthesized headers may differ structurally from original | +| NG3 | Writing back to origin | Violates read-only principle | +| NG4 | Video file support | Focus on audio; defer to future | +| NG5 | Metadata sync to external DBs | Jellyfin/Plex have their own; not our concern | + +--- + +## 4. Proposed Design + +### 4.1 High-Level Architecture + +```plantuml +@startuml +!theme plain +skinparam componentStyle rectangle + +package "FUSE Layer" { + [getattr()] as GA + [read()] as RD +} + +package "Overlay Engine" { + [OverlayReader] as OR + [FormatHandlerRegistry] as FHR +} + +package "Storage" { + database "SQLite\n(metadata columns\n+ format_layout)" as DB + [CAS\n(origin audio chunks)] as CAS +} + +package "Format Handlers (Pluggable)" { + [Id3v2Handler] as H1 + [FlacHandler] as H2 + [WavHandler\n(on demand)] as H3 + [OggHandler\n(on demand)] as H4 + [Mp4Handler\n(on demand)] as H5 +} + +GA --> OR : virtual_size? +RD --> OR : read(ino, offset, size) +OR --> DB : get metadata + layout +OR --> FHR : synthesize(metadata, layout) +FHR --> H1 +FHR --> H2 +FHR --> H3 +FHR --> H4 +FHR --> H5 +OR --> CAS : read audio bytes + +note right of OR + On-the-fly generation: + 1. Read metadata from DB columns + 2. Generate header (~10-50 μs) + 3. Splice header + CAS audio + 4. Return to FUSE +end note + +@enduml +``` + +### 4.2 Core Flows + +#### 4.2.1 Flow: Initial Ingest (Origin Scan) + +Triggered on mount or rescan. Extracts metadata from origin files and +populates all database columns. + +```plantuml +@startuml +!theme plain + +participant "Origin\nFederation" as OF +participant "CAS" as CAS +participant "Format\nHandler" as FH +participant "Metadata\nParser" as MP +database "SQLite" as DB +participant "Tantivy" as TI +participant "Virtual\nTree" as VT + +OF -> OF : Scan origin directory +loop for each audio file + OF -> CAS : Fetch file header (first 256KB) + CAS -> CAS : Chunk and store full file + CAS --> OF : ChunkManifest + + OF -> FH : analyze(header_bytes, file_size) + note right of FH + Detects format, returns + FormatLayout with audio_start, + audio_end, format_data + (e.g. STREAMINFO for FLAC) + end note + FH --> OF : FormatLayout + + OF -> MP : extract(header_bytes) + note right of MP + Uses symphonia to parse + all embedded tags + end note + MP --> OF : metadata fields + + OF -> DB : INSERT INTO files\n(all metadata columns,\nformat_layout, chunk_manifest) + OF -> TI : Index metadata + OF -> VT : Add virtual tree node +end + +@enduml +``` + +#### 4.2.2 Flow: FUSE read() with Overlay + +The core read path. Headers are generated on-the-fly from DB columns — +nothing pre-computed is stored. + +```plantuml +@startuml +!theme plain + +participant "FUSE\nKernel" as FK +participant "Overlay\nReader" as OR +database "SQLite" as DB +participant "Format\nHandler" as FH +participant "CAS" as CAS + +FK -> OR : read(ino, offset, size) +OR -> OR : Lookup file by inode + +OR -> DB : SELECT metadata columns,\nformat_layout WHERE id = ? +note right of DB : ~1 μs via page cache +DB --> OR : FileMetadataRow + +OR -> FH : synthesize(metadata, layout) +note right of FH + On-the-fly generation + ~10-50 μs, pure CPU +end note +FH --> OR : synthetic_header bytes + +OR -> OR : header_len = synthetic_header.len()\nvirtual_size = header_len + audio_len + +alt offset falls in header region + OR -> OR : Slice from synthetic_header +else offset falls in audio region + OR -> OR : origin_offset = audio_start\n+ (offset - header_len) + OR -> CAS : read(file_id, origin_offset, size) + CAS --> OR : audio bytes +else offset spans boundary + OR -> OR : Take header tail + OR -> CAS : read(file_id, audio_start, remaining) + CAS --> OR : audio bytes + OR -> OR : Concatenate header + audio +end + +OR --> FK : reply.data(spliced bytes) + +@enduml +``` + +#### 4.2.3 Flow: FUSE getattr() with Overlay + +Returns the **virtual** file size (synthetic header + audio) instead of +the origin file size. + +```plantuml +@startuml +!theme plain + +participant "FUSE\nKernel" as FK +participant "Virtual\nTree" as VT +database "SQLite" as DB +participant "Format\nHandler" as FH + +FK -> VT : getattr(ino) +VT -> VT : Lookup VirtualNode + +VT -> DB : Read format_layout for file_id +DB --> VT : FormatLayout +VT -> FH : estimate_header_size(metadata) +note right of FH + Fast estimate without + full header synthesis +end note +FH --> VT : estimated_header_len +VT -> VT : virtual_size = estimated_header_len\n+ (audio_end - audio_start) +VT --> FK : FileAttr with virtual_size + +@enduml +``` + +#### 4.2.4 Flow: Metadata Update (User Edits Tags) + +Triggered via gRPC API or CLI. Updates DB columns directly. Next read() +generates a new header automatically. + +```plantuml +@startuml +!theme plain + +actor "User" as U +participant "CLI /\ngRPC" as API +participant "Metadata\nService" as MS +database "SQLite" as DB +participant "Tantivy" as TI +participant "Virtual\nTree" as VT +participant "Event\nBus" as EB + +U -> API : musicfs metadata set\n--title "Fix" --artist "Fix" +API -> MS : UpdateMetadata(file_id, fields) + +alt merge mode (default) + MS -> DB : SELECT current metadata + DB --> MS : current row + MS -> MS : Overwrite only provided fields +end + +MS -> DB : UPDATE files SET title=?, artist=?\nWHERE id=? +DB --> MS : ok + +MS -> TI : Re-index updated file +MS -> VT : Recompute virtual path +note right of VT + If artist/album/title changed + the file moves in the tree +end note + +MS -> EB : Emit MetadataChanged +note right of EB + FUSE attr cache invalidation, + gRPC event subscribers +end note + +MS --> API : success +API --> U : done + +@enduml +``` + +#### 4.2.5 Flow: Metadata Clear (Revert to Original) + +Removes user overrides. File reverts to serving original embedded metadata. + +```plantuml +@startuml +!theme plain + +actor "User" as U +participant "CLI /\ngRPC" as API +participant "Metadata\nService" as MS +participant "CAS" as CAS +participant "Metadata\nParser" as MP +database "SQLite" as DB +participant "Tantivy" as TI +participant "Virtual\nTree" as VT +participant "Event\nBus" as EB + +U -> API : musicfs metadata clear +API -> MS : ClearOverlay(file_id) + +MS -> CAS : Read origin file header +CAS --> MS : header bytes +MS -> MP : extract(header_bytes) +MP --> MS : original metadata + +MS -> DB : UPDATE files SET all columns\nto original values +DB --> MS : ok + +MS -> TI : Re-index with original metadata +MS -> VT : Recompute virtual path +MS -> EB : Emit MetadataCleared + +MS --> API : success +API --> U : done + +@enduml +``` + +#### 4.2.6 Flow: Batch Import + +Import metadata from external source (CSV, JSON, MusicBrainz dump). + +```plantuml +@startuml +!theme plain + +actor "User" as U +participant "CLI /\ngRPC" as API +participant "Import\nEngine" as IE +database "SQLite" as DB +participant "Tantivy" as TI +participant "Event\nBus" as EB + +U -> API : musicfs metadata import\n--format csv metadata.csv +API -> IE : ImportMetadata(file, format) + +IE -> IE : Parse source file (CSV/JSON) +IE -> IE : Match rows to files by\npath, ISRC, or MusicBrainz ID + +IE -> DB : BEGIN TRANSACTION + +loop for each matched row + IE -> DB : UPDATE files SET matched columns + IE -> TI : Re-index file + IE --> API : stream progress +end + +IE -> DB : COMMIT + +IE -> EB : Emit BatchImportComplete + +IE --> API : final summary +API --> U : updated N, skipped M, errors K + +@enduml +``` + +### 4.3 Offset Translation + +``` +Virtual File (what consumer sees): +┌─────────────────────┬────────────────────────────────────────────┐ +│ Synthetic Header │ Original Audio │ +│ (N bytes) │ (M bytes) │ +│ generated on-fly │ from CAS │ +└─────────────────────┴────────────────────────────────────────────┘ +0 N N+M + ↑ ↑ + header_len virtual_size + +Origin File (on storage): +┌─────────────────────┬────────────────────────────────────────────┐ +│ Original Header │ Original Audio │ +│ (X bytes) │ (M bytes) │ +└─────────────────────┴────────────────────────────────────────────┘ +0 X X+M + ↑ ↑ + layout.audio_start layout.audio_end + +Offset Translation: + virtual_offset → origin_offset + + if virtual_offset < N: + return synthetic_header[virtual_offset] + else: + origin_offset = X + (virtual_offset - N) + return cas_read(file_id, origin_offset) +``` + +### 4.4 Format Handler Plugin System + +#### 4.4.1 Handler Trait + +```rust +/// Trait for format-specific metadata handling. +/// +/// Implementations handle: +/// 1. Analyzing original files to find audio boundaries +/// 2. Synthesizing new headers from database metadata +/// +/// Plugins implement this trait and register via FormatHandlerRegistry. +pub trait FormatHandler: Send + Sync + 'static { + fn id(&self) -> &'static str; + fn name(&self) -> &'static str; + fn extensions(&self) -> &[&'static str]; + fn mime_types(&self) -> &[&'static str]; + + /// Analyze file bytes to determine audio layout. + fn analyze(&self, data: &[u8], file_size: u64) -> Result; + + /// Synthesize header bytes from metadata. Called on every read(). + fn synthesize( + &self, + metadata: &FileMetadataRow, + layout: &FormatLayout, + ) -> Result, FormatError>; + + /// Extract metadata from header bytes (for initial ingest). + fn extract(&self, data: &[u8]) -> Result; + + /// Estimate header size without full synthesis (for getattr). + fn estimate_header_size(&self, metadata: &FileMetadataRow) -> usize { + 10 * 1024 // 10KB default + } +} +``` + +#### 4.4.2 Handler Registry + +```rust +pub struct FormatHandlerRegistry { + handlers: HashMap>, + extension_map: HashMap, +} + +impl FormatHandlerRegistry { + pub fn new() -> Self { + // Only MP3 and FLAC shipped by default. + // Other handlers registered via load_plugins() or register(). + let mut r = Self { .. }; + r.register(Arc::new(Id3v2Handler::new())); // .mp3 + r.register(Arc::new(FlacHandler::new())); // .flac + r + } + pub fn register(&mut self, handler: Arc) { /* ... */ } + pub fn get_by_extension(&self, ext: &str) -> Option> { /* ... */ } + pub fn load_plugins(&mut self, plugin_dir: &Path) -> Result { /* ... */ } +} +``` + +#### 4.4.3 Format Complexity Summary + +| Format | Handler | Complexity | Shipped | +|--------|---------|------------|---------| +| **MP3** | `Id3v2Handler` | Low | **Yes** — built-in | +| **FLAC** | `FlacHandler` | Low | **Yes** — built-in | +| **WAV** | `WavHandler` | Low | On demand | +| **OGG/Opus** | `OggHandler` | Medium | On demand | +| **M4A/MP4** | `Mp4Handler` | High | On demand | + +MP3 and FLAC cover the vast majority of music libraries. Other formats +use the same `FormatHandler` trait and can be added as plugins or built-in +handlers when needed — the architecture does not change. + +### 4.5 Database Schema + +All metadata fields are individual columns. SQLite NULL columns cost 0 bytes. +Only `format_layout` and `custom_tags` are blobs. + +```sql +PRAGMA journal_mode = WAL; +PRAGMA foreign_keys = ON; +PRAGMA synchronous = NORMAL; + +CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY, + origin_id TEXT NOT NULL, + real_path TEXT NOT NULL, + virtual_path TEXT NOT NULL, + + -- ═══ Core Identification ═══ + title TEXT, + artist TEXT, + album TEXT, + album_artist TEXT, + track_number INTEGER, + track_total INTEGER, + disc_number INTEGER, + disc_total INTEGER, + date TEXT, -- "2024" or "2024-05-17" + year INTEGER, -- extracted for convenience + genre TEXT, + + -- ═══ Credits ═══ + composer TEXT, + comment TEXT, + lyrics TEXT, + copyright TEXT, + compilation INTEGER, -- 0/1 + + -- ═══ Sorting ═══ + artist_sort TEXT, + album_artist_sort TEXT, + album_sort TEXT, + title_sort TEXT, + + -- ═══ MusicBrainz IDs ═══ + mb_recording_id TEXT, -- Recording MBID + mb_album_id TEXT, -- Release MBID + mb_artist_id TEXT, -- Artist MBID + mb_album_artist_id TEXT, -- Album Artist MBID + mb_release_group_id TEXT, -- Release Group MBID + + -- ═══ ReplayGain ═══ + replaygain_track_gain REAL, -- dB + replaygain_track_peak REAL, -- 0.0-1.0+ + replaygain_album_gain REAL, + replaygain_album_peak REAL, + + -- ═══ Technical (from audio stream, read-only) ═══ + duration_ms INTEGER, + bitrate INTEGER, -- kbps + sample_rate INTEGER, -- Hz + channels INTEGER, + bits_per_sample INTEGER, + format TEXT, -- "flac", "mp3", etc. + encoder TEXT, -- encoding software + + -- ═══ Custom Tags (overflow for non-standard fields) ═══ + custom_tags TEXT, -- JSON: {"ISRC":"US1234","LABEL":"Sony"} + + -- ═══ Format Layout (for byte-range splicing) ═══ + -- Stored as msgpack blob. Contains audio_start, audio_end, + -- format_data (STREAMINFO for FLAC, stco for MP4, etc.) + format_layout BLOB, + + -- ═══ Sync State ═══ + origin_mtime INTEGER NOT NULL, + origin_size INTEGER NOT NULL, + content_hash TEXT, + chunk_manifest BLOB, + last_sync INTEGER NOT NULL DEFAULT (strftime('%s', 'now')), + + -- ═══ Trash (existing feature) ═══ + trashed INTEGER NOT NULL DEFAULT 0, + original_path TEXT, + trashed_at INTEGER, + + UNIQUE(origin_id, real_path) +); + +-- ═══ Indexes ═══ +CREATE INDEX IF NOT EXISTS idx_files_virtual ON files(virtual_path); +CREATE INDEX IF NOT EXISTS idx_files_artist_album ON files(artist, album); +CREATE INDEX IF NOT EXISTS idx_files_content_hash ON files(content_hash); +CREATE INDEX IF NOT EXISTS idx_files_real ON files(origin_id, real_path); +CREATE INDEX IF NOT EXISTS idx_files_origin ON files(origin_id); +CREATE INDEX IF NOT EXISTS idx_files_last_sync ON files(last_sync); + +CREATE INDEX IF NOT EXISTS idx_files_trashed ON files(trashed) WHERE trashed = 1; +CREATE INDEX IF NOT EXISTS idx_files_mb_album ON files(mb_album_id); +CREATE INDEX IF NOT EXISTS idx_files_mb_artist ON files(mb_artist_id); +CREATE INDEX IF NOT EXISTS idx_files_genre ON files(genre); +CREATE INDEX IF NOT EXISTS idx_files_year ON files(year); +CREATE INDEX IF NOT EXISTS idx_files_composer ON files(composer); + +-- ═══ Artwork (unchanged, separate table) ═══ +CREATE TABLE IF NOT EXISTS artwork ( + id INTEGER PRIMARY KEY, + file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, + art_type TEXT NOT NULL, + chunk_hash TEXT NOT NULL, + width INTEGER, + height INTEGER, + mime_type TEXT, + UNIQUE(file_id, art_type) +); + +CREATE INDEX IF NOT EXISTS idx_artwork_file ON artwork(file_id); + +-- ═══ Collections (unchanged) ═══ +CREATE TABLE IF NOT EXISTS collections ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + query_json TEXT NOT NULL, + created_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now')), + updated_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now')) +); + +-- ═══ Directories (unchanged) ═══ +CREATE TABLE IF NOT EXISTS directories ( + id INTEGER PRIMARY KEY, + path TEXT NOT NULL UNIQUE, + created_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now')) +); + +CREATE INDEX IF NOT EXISTS idx_directories_path ON directories(path); +``` + +### 4.6 Read Algorithm + +```rust +impl OverlayReader { + pub async fn read( + &self, + file_id: FileId, + offset: u64, + size: u32, + ) -> Result { + let file = self.db.get_file(file_id)?; + let layout: FormatLayout = rmp_serde::from_slice(&file.format_layout)?; + let handler = self.registry.get_by_format(&file.format)?; + + // Generate header on-the-fly (~10-50 μs) + let header = handler.synthesize(&file, &layout)?; + let header_len = header.len() as u64; + let audio_len = layout.audio_end - layout.audio_start; + let virtual_size = header_len + audio_len; + let virtual_end = (offset + size as u64).min(virtual_size); + + if offset >= virtual_size { + return Ok(Bytes::new()); + } + + let mut result = BytesMut::with_capacity((virtual_end - offset) as usize); + + // Region 1: Synthetic header + if offset < header_len { + let end = virtual_end.min(header_len); + result.extend_from_slice(&header[offset as usize..end as usize]); + } + + // Region 2: Origin audio data + if virtual_end > header_len { + let audio_start = header_len.max(offset) - header_len; + let audio_size = (virtual_end - header_len.max(offset)) as u32; + let origin_offset = layout.audio_start + audio_start; + + let audio = self.cas_reader.read(file_id, origin_offset, audio_size).await?; + result.extend_from_slice(&audio); + } + + Ok(result.freeze()) + } +} +``` + +### 4.7 API Design + +#### 4.7.1 gRPC Extensions + +```protobuf +service MetadataService { + rpc GetMetadata(GetMetadataRequest) returns (MetadataResponse); + rpc UpdateMetadata(UpdateMetadataRequest) returns (UpdateMetadataResponse); + rpc ClearOverlay(ClearOverlayRequest) returns (ClearOverlayResponse); + rpc BatchUpdateMetadata(BatchUpdateRequest) returns (stream BatchUpdateProgress); + rpc ImportMetadata(ImportMetadataRequest) returns (stream ImportProgress); +} + +message UpdateMetadataRequest { + int64 file_id = 1; + // Only set fields you want to change. + // Unset fields are left as-is (merge behavior). + optional string title = 2; + optional string artist = 3; + optional string album = 4; + optional string album_artist = 5; + optional uint32 track_number = 6; + optional uint32 disc_number = 7; + optional string date = 8; + optional string genre = 9; + optional string composer = 10; + optional string comment = 11; + optional string lyrics = 12; + optional string copyright = 13; + optional bool compilation = 14; + optional string artist_sort = 15; + optional string album_artist_sort = 16; + optional string album_sort = 17; + optional string title_sort = 18; + optional string mb_recording_id = 20; + optional string mb_album_id = 21; + optional string mb_artist_id = 22; + optional float replaygain_track_gain = 30; + optional float replaygain_track_peak = 31; + optional float replaygain_album_gain = 32; + optional float replaygain_album_peak = 33; + map custom_tags = 50; +} +``` + +#### 4.7.2 CLI Interface + +Two ways to set metadata: **flags** for quick single-field edits, **JSON** +for bulk or complex updates. Both can be combined. + +```bash +# ── View ── + +# Print all metadata as JSON +musicfs metadata get "/Artist/Album/01 - Track.flac" + +# Print specific field +musicfs metadata get "/Artist/Album/01 - Track.flac" --field artist + +# ── Edit via flags (one field at a time or several) ── + +musicfs metadata set "/Artist/Album/01 - Track.flac" \ + --title "Corrected Title" + +musicfs metadata set "/Artist/Album/01 - Track.flac" \ + --artist "Corrected Artist" \ + --album-artist "Corrected Artist" \ + --year 2024 \ + --genre "Rock" + +# Every DB column has a corresponding flag: +# --title, --artist, --album, --album-artist, +# --track-number, --track-total, --disc-number, --disc-total, +# --date, --year, --genre, +# --composer, --comment, --lyrics, --copyright, --compilation, +# --artist-sort, --album-artist-sort, --album-sort, --title-sort, +# --mb-recording-id, --mb-album-id, --mb-artist-id, +# --mb-album-artist-id, --mb-release-group-id, +# --replaygain-track-gain, --replaygain-track-peak, +# --replaygain-album-gain, --replaygain-album-peak, +# --encoder + +# Set a custom tag (anything not in the standard set) +musicfs metadata set "/path/to/file" --custom ISRC=US1234567890 + +# ── Edit via JSON (any number of fields at once) ── + +# Inline JSON +musicfs metadata set "/Artist/Album/01 - Track.flac" --json '{ + "title": "Corrected Title", + "artist": "Corrected Artist", + "year": 2024, + "custom_tags": {"ISRC": "US1234567890", "LABEL": "Sony"} +}' + +# From file +musicfs metadata set "/Artist/Album/01 - Track.flac" --json @metadata.json + +# Flags and JSON can be combined (flags take precedence) +musicfs metadata set "/path/to/file" --json @base.json --year 2025 + +# ── Revert ── + +# Revert to original embedded metadata +musicfs metadata clear "/Artist/Album/01 - Track.flac" + +# ── Diff ── + +# Show what changed vs original +musicfs metadata diff "/Artist/Album/01 - Track.flac" + +# ── Batch ── + +# Import from CSV (columns map to field names) +musicfs metadata import --format csv metadata.csv + +# Import from JSON (array of objects with "path" or "file_id" key) +musicfs metadata import --format json metadata.json + +# Export +musicfs metadata export --output metadata.json +musicfs metadata export --query "artist:Beatles" --output beatles.json +``` + +--- + +## 5. Cross-Cutting Concerns + +### 5.1 Security & Privacy + +| Concern | Mitigation | +|---------|------------| +| Plugin isolation | Native plugins require explicit trust; future WASM sandboxing | +| No credential exposure | Overlays contain only metadata, never auth tokens | +| Backup/restore | All data in SQLite, included in standard backup | + +### 5.2 Observability + +**Metrics:** +``` +musicfs_overlay_files_modified # Files with user-edited metadata +musicfs_overlay_generation_us # Histogram: header generation time +musicfs_overlay_read_total # Reads served via overlay +``` + +**Logging:** +``` +INFO overlay.update file_id=123 fields=[title,artist] +DEBUG overlay.read file_id=123 offset=0 size=65536 generation_us=42 +WARN overlay.format file_id=456 error="No handler for format=opus" +``` + +### 5.3 Scalability & Performance + +| Metric | Target | Notes | +|--------|--------|-------| +| Header generation | <100 μs | ~10-50 μs typical, pure CPU | +| read() overhead vs passthrough | <5% | One DB read + one synthesize | +| getattr() overhead | <1 μs | estimate_header_size(), no full synthesis | +| Storage per file | 0 extra | Metadata already in columns | +| Memory (LRU cache) | Optional | Cache hot headers if profiling shows need | + +### 5.4 Testing Plan + +| Test Type | Coverage | +|-----------|----------| +| **Unit** | FormatHandler implementations, offset arithmetic | +| **Integration** | Full read path with overlays, DB round-trip | +| **Format Matrix** | Each format × {overlay on, overlay off} | +| **Fuzzing** | Malformed files, boundary offsets, huge metadata | +| **Player Compat** | mpv, VLC, Jellyfin, Plex, ffprobe | + +--- + +## 6. Alternatives Considered + +### 6.1 Alternative A: Pre-generate and Store Headers in DB + +**Description:** Synthesize headers on metadata update, store as BLOB. + +**Rejected Because:** +- 1-10 KB per file × 1M files = 1-10 GB unnecessary storage +- Cache invalidation complexity (must regenerate on any field change) +- Generation is <100 μs — faster than a SQLite BLOB read of that size +- More moving parts for no measurable benefit + +### 6.2 Alternative B: NFO Sidecar Files + +**Description:** Generate `.nfo` XML files alongside audio files. + +**Rejected Because:** +- Only works with players that support NFO (Jellyfin, Plex) +- mpv, VLC, foobar2000 read embedded tags only +- Not transparent to all consumers + +### 6.3 Alternative C: Full File Rewrite + CAS Cache + +**Description:** Rewrite entire file with new metadata, cache in CAS. + +**Rejected Because:** +- Doubles storage for modified files +- High CPU/memory on first access +- Defeats CAS deduplication + +### 6.4 Alternative D: Metadata Blobs Instead of Columns + +**Description:** Store metadata as a single msgpack/JSON blob per file. + +**Rejected Because:** +- Not directly queryable (no `WHERE artist = ?`) +- Not indexable +- SQLite NULL columns cost 0 bytes — no space savings from blobs +- Schema is self-documenting with columns +- Virtual path templates can reference any column directly + +--- + +## 7. Implementation Plan + +### 7.1 Phase 1: Schema Migration + Core Types (3 days) + +| Deliverable | Details | +|-------------|---------| +| Schema migration | Add new columns to files table | +| `FormatLayout` struct | Audio boundary description | +| `FormatHandler` trait | Plugin interface | +| `FormatHandlerRegistry` | Built-in handler registration | + +**Exit Criteria:** DB migrates cleanly, types compile. + +### 7.2 Phase 2: Ingest Pipeline Update (3 days) + +| Deliverable | Details | +|-------------|---------| +| Update symphonia parser | Extract all new fields | +| Format analysis on ingest | Run `analyze()` → store `format_layout` | +| Populate new DB columns | All fields written on scan | + +**Exit Criteria:** Full rescan populates all metadata columns. + +### 7.3 Phase 3: Read Path + MP3/FLAC (5 days) + +| Deliverable | Details | +|-------------|---------| +| `OverlayReader` | Splice logic in FUSE read() | +| `Id3v2Handler` | analyze + synthesize for MP3 | +| `FlacHandler` | analyze + synthesize for FLAC | +| FUSE getattr() | Return virtual_size | + +**Exit Criteria:** ffprobe/mpv reads modified MP3 and FLAC tags. + +### 7.4 Phase 4: API + CLI (3 days) + +| Deliverable | Details | +|-------------|---------| +| gRPC MetadataService | get, set, clear, batch, import | +| CLI commands | `musicfs metadata {get,set,clear,diff,import,export}` | + +**Exit Criteria:** Full API functional end-to-end. + +### 7.6 Rollout + +```toml +[experimental] +metadata_overlay = true # Enable overlay feature + +[metadata_overlay] +# Additional format handlers loaded from this directory +plugin_dir = "/etc/musicfs/format-plugins/" +``` + +**Files with no registered handler** for their format are served with +original bytes unchanged (passthrough). No error, no degradation. + +--- + +## 8. Glossary & References + +### 8.1 Glossary + +| Term | Definition | +|------|------------| +| **Overlay** | Mode where file serves user-edited metadata instead of original | +| **Synthetic Header** | Format-specific metadata bytes generated on-the-fly | +| **Format Layout** | Description of audio/metadata byte boundaries in origin file | +| **Offset Translation** | Converting virtual file offset to origin file offset | + +### 8.2 References + +| Document | Link | +|----------|------| +| ID3v2.4 Specification | https://id3.org/id3v2.4.0-structure | +| FLAC Format | https://xiph.org/flac/format.html | +| OGG Encapsulation | https://xiph.org/ogg/doc/rfc3533.txt | +| MP4 Specification | ISO/IEC 14496-12 | +| MusicBrainz Picard Tag Mapping | https://picard-docs.musicbrainz.org/en/appendices/tag_mapping.html | +| symphonia StandardTagKey | https://docs.rs/symphonia-core/0.5.4/symphonia_core/meta/enum.StandardTagKey.html | +| lofty-rs | https://github.com/Serial-ATA/lofty-rs | +| MusicFS Architecture | [architecture.md](../architecture.md) | + +### 8.3 New Dependencies + +| Crate | Version | Purpose | +|-------|---------|---------| +| lofty | 0.24+ | Metadata header generation (all formats) |