diff --git a/CHANGELOG.md b/CHANGELOG.md index 7175880d1..b17f720ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Vorbis Comments**: Check `ALBUM ARTIST` for `ItemKey::AlbumArtist` conversions * **ItemKey**: `ItemKey` is now `Copy` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/526)) +### Fixed +- **ID3v2**: Support parsing UTF-16 `COMM`/`USLT` frames with a single BOM ([issue](https://github.com/Serial-ATA/lofty-rs/issues/532)) ([PR](https://github.com/Serial-ATA/lofty-rs/pull/535)) + - Some encoders will only write a BOM to the frame's description, rather than to every string in the frame. + This was previously only supported in `SYLT` frames, and has been extended to `COMM` and `USLT`. + ### Removed * **ItemKey**: `ItemKey::Unknown` ([PR](https://github.com/Serial-ATA/lofty-rs/pull/526)) diff --git a/lofty/src/id3/v2/items/language_frame.rs b/lofty/src/id3/v2/items/language_frame.rs index 3cde0a5c0..74fd43dbe 100644 --- a/lofty/src/id3/v2/items/language_frame.rs +++ b/lofty/src/id3/v2/items/language_frame.rs @@ -3,7 +3,10 @@ use crate::id3::v2::frame::content::verify_encoding; use crate::id3::v2::header::Id3v2Version; use crate::id3::v2::{FrameFlags, FrameHeader, FrameId}; use crate::tag::items::Lang; -use crate::util::text::{TextDecodeOptions, TextEncoding, decode_text, encode_text}; +use crate::util::text::{ + DecodeTextResult, TextDecodeOptions, TextEncoding, decode_text, encode_text, + utf16_decode_terminated_maybe_bom, +}; use std::borrow::Cow; use std::hash::{Hash, Hasher}; @@ -35,12 +38,34 @@ impl LanguageFrame { let mut language = [0; 3]; reader.read_exact(&mut language)?; - let description = decode_text( + let DecodeTextResult { + content: description, + bom, + .. + } = decode_text( reader, TextDecodeOptions::new().encoding(encoding).terminated(true), - )? - .content; - let content = decode_text(reader, TextDecodeOptions::new().encoding(encoding))?.content; + )?; + + let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes; + + // It's possible for the description to be the only string with a BOM + // To be safe, we change the encoding to the concrete variant determined from the description + if encoding == TextEncoding::UTF16 { + endianness = match bom { + [0xFF, 0xFE] => u16::from_le_bytes, + [0xFE, 0xFF] => u16::from_be_bytes, + // The BOM has to be valid for `decode_text` to succeed + _ => unreachable!("Bad BOM {bom:?}"), + }; + } + + let content; + if encoding == TextEncoding::UTF16 { + (content, _) = utf16_decode_terminated_maybe_bom(reader, endianness)?; + } else { + content = decode_text(reader, TextDecodeOptions::new().encoding(encoding))?.content; + } Ok(Some(Self { encoding, diff --git a/lofty/src/id3/v2/items/sync_text.rs b/lofty/src/id3/v2/items/sync_text.rs index 7d3511634..bdf911fa0 100644 --- a/lofty/src/id3/v2/items/sync_text.rs +++ b/lofty/src/id3/v2/items/sync_text.rs @@ -2,12 +2,12 @@ use crate::error::{ErrorKind, Id3v2Error, Id3v2ErrorKind, LoftyError, Result}; use crate::id3::v2::{FrameFlags, FrameHeader, FrameId}; use crate::macros::err; use crate::util::text::{ - TextDecodeOptions, TextEncoding, decode_text, encode_text, read_to_terminator, - utf16_decode_bytes, + DecodeTextResult, TextDecodeOptions, TextEncoding, decode_text, encode_text, + utf16_decode_terminated_maybe_bom, }; use std::borrow::Cow; -use std::io::{Cursor, Read, Seek, SeekFrom, Write}; +use std::io::{Cursor, Seek, Write}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; @@ -148,23 +148,26 @@ impl SynchronizedTextFrame<'_> { .ok_or_else(|| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?; let mut cursor = Cursor::new(&data[6..]); - let description = crate::util::text::decode_text( + let DecodeTextResult { + content: description, + bom, + .. + } = decode_text( &mut cursor, TextDecodeOptions::new().encoding(encoding).terminated(true), ) - .map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))? - .text_or_none(); + .map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?; let mut endianness: fn([u8; 2]) -> u16 = u16::from_le_bytes; // It's possible for the description to be the only string with a BOM // To be safe, we change the encoding to the concrete variant determined from the description if encoding == TextEncoding::UTF16 { - endianness = match cursor.get_ref()[..=1] { + endianness = match bom { [0xFF, 0xFE] => u16::from_le_bytes, [0xFE, 0xFF] => u16::from_be_bytes, // Since the description was already read, we can assume the BOM was valid - _ => unreachable!(), + _ => unreachable!("Bad BOM {bom:?}"), }; } @@ -174,24 +177,14 @@ impl SynchronizedTextFrame<'_> { let mut content = Vec::new(); while pos < total { - let text = (|| -> Result { - if encoding == TextEncoding::UTF16 { - // Check for a BOM - let mut bom = [0; 2]; - cursor - .read_exact(&mut bom) + let text; + if encoding == TextEncoding::UTF16 { + let (decoded, bytes_read) = + utf16_decode_terminated_maybe_bom(&mut cursor, endianness) .map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?; - - cursor.seek(SeekFrom::Current(-2))?; - - // Encountered text that doesn't include a BOM - if bom != [0xFF, 0xFE] && bom != [0xFE, 0xFF] { - let (raw_text, _) = read_to_terminator(&mut cursor, TextEncoding::UTF16); - return utf16_decode_bytes(&raw_text, endianness) - .map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText).into()); - } - } - + pos += bytes_read as u64; + text = decoded; + } else { let decoded_text = decode_text( &mut cursor, TextDecodeOptions::new().encoding(encoding).terminated(true), @@ -199,8 +192,8 @@ impl SynchronizedTextFrame<'_> { .map_err(|_| Id3v2Error::new(Id3v2ErrorKind::BadSyncText))?; pos += decoded_text.bytes_read as u64; - Ok(decoded_text.content) - })()?; + text = decoded_text.content; + } let time = cursor .read_u32::() @@ -217,7 +210,11 @@ impl SynchronizedTextFrame<'_> { language, timestamp_format, content_type, - description, + description: if description.is_empty() { + None + } else { + Some(description) + }, content, }) } diff --git a/lofty/src/util/text.rs b/lofty/src/util/text.rs index a482b1ba7..f1fd7e201 100644 --- a/lofty/src/util/text.rs +++ b/lofty/src/util/text.rs @@ -160,22 +160,15 @@ where err!(TextDecode("UTF-16 string has an odd length")); } - let bom_to_check; if options.bom == [0, 0] { - bom_to_check = [raw_bytes[0], raw_bytes[1]]; + bom = [raw_bytes[0], raw_bytes[1]]; } else { - bom_to_check = options.bom; + bom = options.bom; } - match bom_to_check { - [0xFE, 0xFF] => { - bom = [0xFE, 0xFF]; - utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)? - }, - [0xFF, 0xFE] => { - bom = [0xFF, 0xFE]; - utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)? - }, + match bom { + [0xFE, 0xFF] => utf16_decode_bytes(&raw_bytes[2..], u16::from_be_bytes)?, + [0xFF, 0xFE] => utf16_decode_bytes(&raw_bytes[2..], u16::from_le_bytes)?, _ => err!(TextDecode("UTF-16 string has an invalid byte order mark")), } }, @@ -184,10 +177,6 @@ where .map_err(|_| LoftyError::new(ErrorKind::TextDecode("Expected a UTF-8 string")))?, }; - if read_string.is_empty() { - return Ok(EMPTY_DECODED_TEXT); - } - Ok(DecodeTextResult { content: read_string, bytes_read, @@ -278,6 +267,36 @@ pub(crate) fn utf16_decode_bytes(bytes: &[u8], endianness: fn([u8; 2]) -> u16) - utf16_decode(&unverified) } +// TODO: Can probably just be merged into an option on `TextDecodeOptions` +/// Read a null-terminated UTF-16 string that may or may not have a BOM +/// +/// This is needed for ID3v2, as some encoders will encode *only* the first string in a frame with a BOM, +/// and the others are assumed to have the same byte order. +/// +/// This is seen in frames like SYLT, COMM, and USLT, where the description will be the only string +/// with a BOM. +/// +/// If no BOM is present, the string will be decoded using `endianness`. +pub(crate) fn utf16_decode_terminated_maybe_bom( + reader: &mut R, + endianness: fn([u8; 2]) -> u16, +) -> Result<(String, usize)> +where + R: Read, +{ + let (raw_text, terminator_len) = read_to_terminator(reader, TextEncoding::UTF16); + + let bytes_read = raw_text.len() + terminator_len; + let decoded; + match &*raw_text { + [0xFF, 0xFE, ..] => decoded = utf16_decode_bytes(&raw_text[2..], u16::from_le_bytes), + [0xFE, 0xFF, ..] => decoded = utf16_decode_bytes(&raw_text[2..], u16::from_be_bytes), + _ => decoded = utf16_decode_bytes(&raw_text, endianness), + } + + decoded.map(|d| (d, bytes_read)) +} + pub(crate) fn encode_text(text: &str, text_encoding: TextEncoding, terminated: bool) -> Vec { match text_encoding { TextEncoding::Latin1 => {