diff --git a/crates/feedparser-rs-core/src/http/client.rs b/crates/feedparser-rs-core/src/http/client.rs index fbb6898..b77b814 100644 --- a/crates/feedparser-rs-core/src/http/client.rs +++ b/crates/feedparser-rs-core/src/http/client.rs @@ -3,7 +3,8 @@ use super::validation::validate_url; use crate::error::{FeedError, Result}; use reqwest::blocking::{Client, Response}; use reqwest::header::{ - ACCEPT, ACCEPT_ENCODING, HeaderMap, HeaderValue, IF_MODIFIED_SINCE, IF_NONE_MATCH, USER_AGENT, + ACCEPT, ACCEPT_ENCODING, HeaderMap, HeaderName, HeaderValue, IF_MODIFIED_SINCE, IF_NONE_MATCH, + USER_AGENT, }; use std::collections::HashMap; use std::time::Duration; @@ -63,6 +64,25 @@ impl FeedHttpClient { self } + /// Insert header with consistent error handling + /// + /// Helper method to reduce boilerplate in header insertion. + #[inline] + fn insert_header( + headers: &mut HeaderMap, + name: HeaderName, + value: &str, + field_name: &str, + ) -> Result<()> { + headers.insert( + name, + HeaderValue::from_str(value).map_err(|e| FeedError::Http { + message: format!("Invalid {field_name}: {e}"), + })?, + ); + Ok(()) + } + /// Fetches a feed from the given URL /// /// Supports conditional GET with `ETag` and `Last-Modified` headers. @@ -91,12 +111,7 @@ impl FeedHttpClient { let mut headers = HeaderMap::new(); // Standard headers - headers.insert( - USER_AGENT, - HeaderValue::from_str(&self.user_agent).map_err(|e| FeedError::Http { - message: format!("Invalid User-Agent: {e}"), - })?, - ); + Self::insert_header(&mut headers, USER_AGENT, &self.user_agent, "User-Agent")?; headers.insert( ACCEPT, @@ -112,21 +127,16 @@ impl FeedHttpClient { // Conditional GET headers if let Some(etag_val) = etag { - headers.insert( - IF_NONE_MATCH, - HeaderValue::from_str(etag_val).map_err(|e| FeedError::Http { - message: format!("Invalid ETag: {e}"), - })?, - ); + Self::insert_header(&mut headers, IF_NONE_MATCH, etag_val, "ETag")?; } if let Some(modified_val) = modified { - headers.insert( + Self::insert_header( + &mut headers, IF_MODIFIED_SINCE, - HeaderValue::from_str(modified_val).map_err(|e| FeedError::Http { - message: format!("Invalid Last-Modified: {e}"), - })?, - ); + modified_val, + "Last-Modified", + )?; } // Merge extra headers @@ -266,4 +276,46 @@ mod tests { let err_msg = result.err().unwrap().to_string(); assert!(err_msg.contains("Internal domain TLD not allowed")); } + + #[test] + fn test_insert_header_valid() { + let mut headers = HeaderMap::new(); + let result = + FeedHttpClient::insert_header(&mut headers, USER_AGENT, "TestBot/1.0", "User-Agent"); + assert!(result.is_ok()); + assert_eq!(headers.get(USER_AGENT).unwrap(), "TestBot/1.0"); + } + + #[test] + fn test_insert_header_invalid_value() { + let mut headers = HeaderMap::new(); + // Invalid header value with control characters + let result = FeedHttpClient::insert_header( + &mut headers, + USER_AGENT, + "Invalid\nHeader", + "User-Agent", + ); + assert!(result.is_err()); + match result { + Err(FeedError::Http { message }) => { + assert!(message.contains("Invalid User-Agent")); + } + _ => panic!("Expected Http error"), + } + } + + #[test] + fn test_insert_header_multiple_headers() { + let mut headers = HeaderMap::new(); + + FeedHttpClient::insert_header(&mut headers, USER_AGENT, "TestBot/1.0", "User-Agent") + .unwrap(); + + FeedHttpClient::insert_header(&mut headers, ACCEPT, "application/xml", "Accept").unwrap(); + + assert_eq!(headers.len(), 2); + assert_eq!(headers.get(USER_AGENT).unwrap(), "TestBot/1.0"); + assert_eq!(headers.get(ACCEPT).unwrap(), 
"application/xml"); + } } diff --git a/crates/feedparser-rs-core/src/namespace/dublin_core.rs b/crates/feedparser-rs-core/src/namespace/dublin_core.rs index 1210eb7..e07d313 100644 --- a/crates/feedparser-rs-core/src/namespace/dublin_core.rs +++ b/crates/feedparser-rs-core/src/namespace/dublin_core.rs @@ -39,11 +39,7 @@ pub fn handle_feed_element(element: &str, text: &str, feed: &mut FeedMeta) { // Store in dc_creator field feed.dc_creator = Some(text.to_string()); // Also add to authors list - feed.authors.push(Person { - name: Some(text.to_string()), - email: None, - uri: None, - }); + feed.authors.push(Person::from_name(text)); } "date" => { // dc:date → updated (if not already set) @@ -55,11 +51,7 @@ pub fn handle_feed_element(element: &str, text: &str, feed: &mut FeedMeta) { } "subject" => { // dc:subject → tags - feed.tags.push(Tag { - term: text.to_string(), - scheme: None, - label: None, - }); + feed.tags.push(Tag::new(text)); } "description" => { // dc:description → subtitle (if not already set) @@ -101,11 +93,7 @@ pub fn handle_feed_element(element: &str, text: &str, feed: &mut FeedMeta) { } "contributor" => { // dc:contributor → contributors - feed.contributors.push(Person { - name: Some(text.to_string()), - email: None, - uri: None, - }); + feed.contributors.push(Person::from_name(text)); } _ => { // Ignore unknown DC elements (source, type, format, coverage, etc.) @@ -127,11 +115,7 @@ pub fn handle_entry_element(element: &str, text: &str, entry: &mut Entry) { entry.author = Some(text.to_string()); } entry.dc_creator = Some(text.to_string()); - entry.authors.push(Person { - name: Some(text.to_string()), - email: None, - uri: None, - }); + entry.authors.push(Person::from_name(text)); } "date" => { if let Some(dt) = parse_date(text) { @@ -144,11 +128,7 @@ pub fn handle_entry_element(element: &str, text: &str, entry: &mut Entry) { } "subject" => { entry.dc_subject.push(text.to_string()); - entry.tags.push(Tag { - term: text.to_string(), - scheme: None, - label: None, - }); + entry.tags.push(Tag::new(text)); } "description" => { if entry.summary.is_none() { @@ -166,11 +146,7 @@ pub fn handle_entry_element(element: &str, text: &str, entry: &mut Entry) { } } "contributor" => { - entry.contributors.push(Person { - name: Some(text.to_string()), - email: None, - uri: None, - }); + entry.contributors.push(Person::from_name(text)); } "rights" => { entry.dc_rights = Some(text.to_string()); diff --git a/crates/feedparser-rs-core/src/namespace/media_rss.rs b/crates/feedparser-rs-core/src/namespace/media_rss.rs index 4802176..6f27399 100644 --- a/crates/feedparser-rs-core/src/namespace/media_rss.rs +++ b/crates/feedparser-rs-core/src/namespace/media_rss.rs @@ -46,21 +46,13 @@ pub fn handle_entry_element(element: &str, text: &str, entry: &mut Entry) { for keyword in text.split(',') { let keyword = keyword.trim(); if !keyword.is_empty() { - entry.tags.push(Tag { - term: keyword.to_string(), - scheme: None, - label: None, - }); + entry.tags.push(Tag::new(keyword)); } } } "category" => { if !text.is_empty() { - entry.tags.push(Tag { - term: text.to_string(), - scheme: None, - label: None, - }); + entry.tags.push(Tag::new(text)); } } _ => { diff --git a/crates/feedparser-rs-core/src/parser/atom.rs b/crates/feedparser-rs-core/src/parser/atom.rs index 7ea2edf..36b4071 100644 --- a/crates/feedparser-rs-core/src/parser/atom.rs +++ b/crates/feedparser-rs-core/src/parser/atom.rs @@ -119,8 +119,7 @@ fn parse_feed_element( match element.name().as_ref() { b"title" if !is_empty => { let text = 
parse_text_construct(reader, &mut buf, &element, limits)?; - feed.feed.title = Some(text.value.clone()); - feed.feed.title_detail = Some(text); + feed.feed.set_title(text); } b"link" => { if let Some(link) = Link::from_attributes( @@ -141,8 +140,7 @@ fn parse_feed_element( } b"subtitle" if !is_empty => { let text = parse_text_construct(reader, &mut buf, &element, limits)?; - feed.feed.subtitle = Some(text.value.clone()); - feed.feed.subtitle_detail = Some(text); + feed.feed.set_subtitle(text); } b"id" if !is_empty => { feed.feed.id = Some(read_text(reader, &mut buf, limits)?); @@ -154,8 +152,7 @@ fn parse_feed_element( b"author" if !is_empty => { if let Ok(person) = parse_person(reader, &mut buf, limits, depth) { if feed.feed.author.is_none() { - feed.feed.author.clone_from(&person.name); - feed.feed.author_detail = Some(person.clone()); + feed.feed.set_author(person.clone()); } feed.feed .authors @@ -182,8 +179,7 @@ fn parse_feed_element( } b"generator" if !is_empty => { let generator = parse_generator(reader, &mut buf, &element, limits)?; - feed.feed.generator = Some(generator.value.clone()); - feed.feed.generator_detail = Some(generator); + feed.feed.set_generator(generator); } b"icon" if !is_empty => { feed.feed.icon = Some(read_text(reader, &mut buf, limits)?); @@ -193,16 +189,10 @@ fn parse_feed_element( } b"rights" if !is_empty => { let text = parse_text_construct(reader, &mut buf, &element, limits)?; - feed.feed.rights = Some(text.value.clone()); - feed.feed.rights_detail = Some(text); + feed.feed.set_rights(text); } b"entry" if !is_empty => { - if feed.entries.is_at_limit(limits.max_entries) { - feed.bozo = true; - feed.bozo_exception = - Some(format!("Entry limit exceeded: {}", limits.max_entries)); - skip_element(reader, &mut buf, limits, *depth)?; - *depth = depth.saturating_sub(1); + if !feed.check_entry_limit(reader, &mut buf, limits, depth)? 
{ continue; } @@ -288,8 +278,7 @@ fn parse_entry( match element.name().as_ref() { b"title" if !is_empty => { let text = parse_text_construct(reader, buf, &element, limits)?; - entry.title = Some(text.value.clone()); - entry.title_detail = Some(text); + entry.set_title(text); } b"link" => { if let Some(link) = Link::from_attributes( @@ -320,8 +309,7 @@ fn parse_entry( } b"summary" if !is_empty => { let text = parse_text_construct(reader, buf, &element, limits)?; - entry.summary = Some(text.value.clone()); - entry.summary_detail = Some(text); + entry.set_summary(text); } b"content" if !is_empty => { let content = parse_content(reader, buf, &element, limits)?; @@ -332,8 +320,7 @@ fn parse_entry( b"author" if !is_empty => { if let Ok(person) = parse_person(reader, buf, limits, depth) { if entry.author.is_none() { - entry.author.clone_from(&person.name); - entry.author_detail = Some(person.clone()); + entry.set_author(person.clone()); } entry.authors.try_push_limited(person, limits.max_authors); } } diff --git a/crates/feedparser-rs-core/src/parser/common.rs b/crates/feedparser-rs-core/src/parser/common.rs index 3696fb5..37097ee 100644 --- a/crates/feedparser-rs-core/src/parser/common.rs +++ b/crates/feedparser-rs-core/src/parser/common.rs @@ -11,6 +11,7 @@ use crate::{ use quick_xml::{Reader, events::Event}; pub use crate::types::{FromAttributes, LimitedCollectionExt}; +pub use crate::util::text::bytes_to_string; /// Initial capacity for XML event buffer (fits most elements) pub const EVENT_BUFFER_CAPACITY: usize = 1024; @@ -18,6 +19,56 @@ pub const EVENT_BUFFER_CAPACITY: usize = 1024; /// Initial capacity for text content (typical field size) pub const TEXT_BUFFER_CAPACITY: usize = 256; +/// Creates a new event buffer with optimized capacity +/// +/// This factory function provides a semantic way to create XML event buffers +/// with consistent capacity across all parsers. Using this instead of direct +/// `Vec::with_capacity()` calls makes it easier to tune buffer sizes in one place. +/// +/// # Returns +/// +/// A `Vec<u8>` pre-allocated with `EVENT_BUFFER_CAPACITY` (1024 bytes) +/// +/// # Examples +/// +/// ```ignore +/// use feedparser_rs_core::parser::common::new_event_buffer; +/// +/// let mut buf = new_event_buffer(); +/// assert!(buf.capacity() >= 1024); +/// ``` +#[inline] +#[must_use] +#[allow(dead_code)] // Future use: Will be adopted when refactoring parsers +pub fn new_event_buffer() -> Vec<u8> { + Vec::with_capacity(EVENT_BUFFER_CAPACITY) +} + +/// Creates a new text buffer with optimized capacity +/// +/// This factory function provides a semantic way to create text content buffers +/// with consistent capacity across all parsers. Useful for accumulating text +/// content from XML elements. +/// +/// # Returns +/// +/// A `String` pre-allocated with `TEXT_BUFFER_CAPACITY` (256 bytes) +/// +/// # Examples +/// +/// ```ignore +/// use feedparser_rs_core::parser::common::new_text_buffer; +/// +/// let mut text = new_text_buffer(); +/// assert!(text.capacity() >= 256); +/// ``` +#[inline] +#[must_use] +#[allow(dead_code)] // Future use: Will be adopted when refactoring parsers +pub fn new_text_buffer() -> String { + String::with_capacity(TEXT_BUFFER_CAPACITY) +} + /// Context for parsing operations /// /// Bundles together common parsing state to reduce function parameter count.
@@ -106,18 +157,6 @@ pub fn check_depth(depth: usize, max_depth: usize) -> Result<()> { Ok(()) } -/// Efficient string conversion from bytes - zero-copy for valid UTF-8 -/// -/// Uses `std::str::from_utf8()` for zero-copy conversion when the input -/// is valid UTF-8, falling back to lossy conversion otherwise. -#[inline] -pub fn bytes_to_string(value: &[u8]) -> String { - std::str::from_utf8(value).map_or_else( - |_| String::from_utf8_lossy(value).into_owned(), - std::string::ToString::to_string, - ) -} - /// Read text content from current XML element (handles text and CDATA) pub fn read_text( reader: &mut Reader<&[u8]>, diff --git a/crates/feedparser-rs-core/src/parser/json.rs b/crates/feedparser-rs-core/src/parser/json.rs index 7d6f43a..9d6aeb8 100644 --- a/crates/feedparser-rs-core/src/parser/json.rs +++ b/crates/feedparser-rs-core/src/parser/json.rs @@ -77,8 +77,7 @@ pub fn parse_json_feed_with_limits(data: &[u8], limits: ParserLimits) -> Result< fn parse_feed_metadata(json: &Value, feed: &mut FeedMeta, limits: &ParserLimits) { if let Some(title) = json.get("title").and_then(|v| v.as_str()) { let truncated = truncate_text(title, limits.max_text_length); - feed.title_detail = Some(TextConstruct::text(&truncated)); - feed.title = Some(truncated); + feed.set_title(TextConstruct::text(&truncated)); } if let Some(url) = json.get("home_page_url").and_then(|v| v.as_str()) @@ -162,8 +161,7 @@ fn parse_item(json: &Value, limits: &ParserLimits) -> Entry { if let Some(title) = json.get("title").and_then(|v| v.as_str()) { let truncated = truncate_text(title, limits.max_text_length); - entry.title_detail = Some(TextConstruct::text(&truncated)); - entry.title = Some(truncated); + entry.set_title(TextConstruct::text(&truncated)); } if let Some(content_html) = json.get("content_html").and_then(|v| v.as_str()) { @@ -182,8 +180,7 @@ fn parse_item(json: &Value, limits: &ParserLimits) -> Entry { if let Some(summary) = json.get("summary").and_then(|v| v.as_str()) { let truncated = truncate_text(summary, limits.max_text_length); - entry.summary_detail = Some(TextConstruct::text(&truncated)); - entry.summary = Some(truncated); + entry.set_summary(TextConstruct::text(&truncated)); } if let Some(image) = json.get("image").and_then(|v| v.as_str()) { diff --git a/crates/feedparser-rs-core/src/parser/mod.rs b/crates/feedparser-rs-core/src/parser/mod.rs index 419c059..35cbb80 100644 --- a/crates/feedparser-rs-core/src/parser/mod.rs +++ b/crates/feedparser-rs-core/src/parser/mod.rs @@ -2,10 +2,12 @@ pub mod atom; mod common; mod detect; pub mod json; +pub mod namespace_detection; pub mod rss; use crate::{error::Result, types::ParsedFeed}; +pub use common::skip_element; pub use detect::detect_format; /// Parse feed from raw bytes diff --git a/crates/feedparser-rs-core/src/parser/namespace_detection.rs b/crates/feedparser-rs-core/src/parser/namespace_detection.rs new file mode 100644 index 0000000..2912ebf --- /dev/null +++ b/crates/feedparser-rs-core/src/parser/namespace_detection.rs @@ -0,0 +1,227 @@ +//! Namespace tag detection utilities +//! +//! This module provides efficient namespace prefix matching for XML elements. +//! Instead of duplicating `is_dc_tag()`, `is_content_tag()`, etc. across parsers, +//! we use a single `NamespacePrefix` abstraction. +//! +//! # Examples +//! +//! ```ignore +//! use feedparser_rs_core::parser::namespace_detection::namespaces; +//! +//! let tag_name = b"dc:creator"; +//! if let Some(element) = namespaces::DC.matches(tag_name) { +//! assert_eq!(element, "creator"); +//! } +//! 
``` + +/// Namespace prefix configuration for efficient tag matching +/// +/// This struct stores a namespace prefix (e.g., `"dc:"`) and provides +/// zero-cost matching against tag names. It uses `const fn` construction +/// for compile-time initialization. +#[derive(Debug, Clone, Copy)] +#[allow(dead_code)] // Future use: Will be adopted when consolidating namespace detection +pub struct NamespacePrefix { + prefix: &'static [u8], + prefix_len: usize, +} + +impl NamespacePrefix { + /// Creates a new namespace prefix matcher + /// + /// # Examples + /// + /// ```ignore + /// use feedparser_rs_core::parser::namespace_detection::NamespacePrefix; + /// + /// const CUSTOM: NamespacePrefix = NamespacePrefix::new("custom:"); + /// ``` + #[must_use] + #[allow(dead_code)] // Future use + pub const fn new(prefix: &'static str) -> Self { + let prefix_bytes = prefix.as_bytes(); + Self { + prefix: prefix_bytes, + prefix_len: prefix_bytes.len(), + } + } + + /// Check if tag name matches this namespace prefix + /// + /// Returns the element name after the prefix if matched, or `None` if + /// the tag doesn't start with this prefix. + /// + /// # Arguments + /// + /// * `tag_name` - The full tag name (e.g., `b"dc:creator"`) + /// + /// # Returns + /// + /// * `Some(element)` - Element name after prefix (e.g., `"creator"`) + /// * `None` - Tag doesn't match this prefix + /// + /// # Examples + /// + /// ```ignore + /// use feedparser_rs_core::parser::namespace_detection::namespaces; + /// + /// assert_eq!(namespaces::DC.matches(b"dc:creator"), Some("creator")); + /// assert_eq!(namespaces::DC.matches(b"content:encoded"), None); + /// assert_eq!(namespaces::DC.matches(b"dc:"), Some("")); // Empty element name + /// ``` + #[inline] + #[must_use] + #[allow(dead_code)] // Future use + pub fn matches<'a>(&self, tag_name: &'a [u8]) -> Option<&'a str> { + if tag_name.starts_with(self.prefix) { + std::str::from_utf8(&tag_name[self.prefix_len..]).ok() + } else { + None + } + } + + /// Returns the prefix string (e.g., `"dc:"`) + /// + /// # Safety + /// + /// This function uses `unsafe` because `std::str::from_utf8` is not + /// yet `const fn` stable. The safety invariant is guaranteed by: + /// + /// 1. `new()` only accepts `&'static str` (compile-time valid UTF-8) + /// 2. `as_bytes()` is a reversible, safe transformation + /// 3. The field is private and immutable - no external mutation + /// 4. All instances are const-initialized with string literals + /// + /// # Examples + /// + /// ```ignore + /// use feedparser_rs_core::parser::namespace_detection::namespaces; + /// + /// assert_eq!(namespaces::DC.prefix(), "dc:"); + /// ``` + #[inline] + #[must_use] + #[allow(dead_code)] // Future use + pub const fn prefix(&self) -> &'static str { + // SAFETY: prefix is always constructed from &'static str in new(), + // which guarantees valid UTF-8. The field is private and immutable, + // so no external code can violate this invariant. + #[allow(unsafe_code)] + unsafe { + std::str::from_utf8_unchecked(self.prefix) + } + } +} + +/// Common namespace prefixes used in RSS/Atom feeds +/// +/// These constants provide efficient namespace detection across all parsers. +/// Each constant uses `const fn` construction for zero runtime cost. 
+/// +/// # Available Namespaces +/// +/// - `DC` - Dublin Core (`dc:`) +/// - `CONTENT` - RSS Content Module (`content:`) +/// - `MEDIA` - Media RSS (`media:`) +/// - `ITUNES` - iTunes Podcast (`itunes:`) +/// - `PODCAST` - Podcast 2.0 (`podcast:`) +/// +/// # Examples +/// +/// ```ignore +/// use feedparser_rs_core::parser::namespace_detection::namespaces; +/// +/// let tag = b"itunes:author"; +/// if let Some(element) = namespaces::ITUNES.matches(tag) { +/// println!("iTunes element: {element}"); +/// } +/// ``` +pub mod namespaces { + use super::NamespacePrefix; + + /// Dublin Core namespace prefix (`dc:`) + /// + /// Common elements: `creator`, `publisher`, `rights`, `date`, `identifier` + #[allow(dead_code)] // Future use + pub const DC: NamespacePrefix = NamespacePrefix::new("dc:"); + + /// RSS Content Module namespace prefix (`content:`) + /// + /// Common elements: `encoded` + #[allow(dead_code)] // Future use + pub const CONTENT: NamespacePrefix = NamespacePrefix::new("content:"); + + /// Media RSS namespace prefix (`media:`) + /// + /// Common elements: `content`, `thumbnail`, `description`, `keywords` + #[allow(dead_code)] // Future use + pub const MEDIA: NamespacePrefix = NamespacePrefix::new("media:"); + + /// iTunes Podcast namespace prefix (`itunes:`) + /// + /// Common elements: `author`, `summary`, `explicit`, `category`, `image` + #[allow(dead_code)] // Future use + pub const ITUNES: NamespacePrefix = NamespacePrefix::new("itunes:"); + + /// Podcast 2.0 namespace prefix (`podcast:`) + /// + /// Common elements: `transcript`, `chapters`, `soundbite`, `person` + #[allow(dead_code)] // Future use + pub const PODCAST: NamespacePrefix = NamespacePrefix::new("podcast:"); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_namespace_prefix_matches() { + assert_eq!(namespaces::DC.matches(b"dc:creator"), Some("creator")); + assert_eq!(namespaces::DC.matches(b"dc:publisher"), Some("publisher")); + assert_eq!( + namespaces::CONTENT.matches(b"content:encoded"), + Some("encoded") + ); + assert_eq!( + namespaces::MEDIA.matches(b"media:thumbnail"), + Some("thumbnail") + ); + } + + #[test] + fn test_namespace_prefix_no_match() { + assert_eq!(namespaces::DC.matches(b"content:encoded"), None); + assert_eq!(namespaces::CONTENT.matches(b"dc:creator"), None); + assert_eq!(namespaces::MEDIA.matches(b"itunes:author"), None); + } + + #[test] + fn test_namespace_prefix_empty_element() { + // Edge case: prefix matches but no element name + assert_eq!(namespaces::DC.matches(b"dc:"), Some("")); + } + + #[test] + fn test_namespace_prefix_invalid_utf8() { + // Invalid UTF-8 after prefix should return None + let invalid = b"dc:\xFF\xFE"; + assert_eq!(namespaces::DC.matches(invalid), None); + } + + #[test] + fn test_namespace_prefix_getter() { + assert_eq!(namespaces::DC.prefix(), "dc:"); + assert_eq!(namespaces::CONTENT.prefix(), "content:"); + assert_eq!(namespaces::MEDIA.prefix(), "media:"); + assert_eq!(namespaces::ITUNES.prefix(), "itunes:"); + assert_eq!(namespaces::PODCAST.prefix(), "podcast:"); + } + + #[test] + fn test_custom_namespace() { + const CUSTOM: NamespacePrefix = NamespacePrefix::new("custom:"); + assert_eq!(CUSTOM.matches(b"custom:field"), Some("field")); + assert_eq!(CUSTOM.matches(b"other:field"), None); + } +} diff --git a/crates/feedparser-rs-core/src/parser/rss.rs b/crates/feedparser-rs-core/src/parser/rss.rs index 88d944d..7b024ef 100644 --- a/crates/feedparser-rs-core/src/parser/rss.rs +++ b/crates/feedparser-rs-core/src/parser/rss.rs @@ -172,12 +172,7 @@ fn 
parse_channel( } } b"item" => { - if feed.entries.is_at_limit(limits.max_entries) { - feed.bozo = true; - feed.bozo_exception = - Some(format!("Entry limit exceeded: {}", limits.max_entries)); - skip_element(reader, &mut buf, limits, *depth)?; - *depth = depth.saturating_sub(1); + if !feed.check_entry_limit(reader, &mut buf, limits, depth)? { continue; } diff --git a/crates/feedparser-rs-core/src/types/common.rs b/crates/feedparser-rs-core/src/types/common.rs index 1eebe12..c82a208 100644 --- a/crates/feedparser-rs-core/src/types/common.rs +++ b/crates/feedparser-rs-core/src/types/common.rs @@ -1,15 +1,7 @@ use super::generics::{FromAttributes, ParseFrom}; +use crate::util::text::bytes_to_string; use serde_json::Value; -/// Helper for efficient bytes to string conversion -#[inline] -fn bytes_to_string(value: &[u8]) -> String { - std::str::from_utf8(value).map_or_else( - |_| String::from_utf8_lossy(value).into_owned(), - std::string::ToString::to_string, - ) -} - /// Link in feed or entry #[derive(Debug, Clone, Default)] pub struct Link { @@ -99,6 +91,29 @@ pub struct Person { pub uri: Option<String>, } +impl Person { + /// Creates a person from just a name + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::types::Person; + /// + /// let person = Person::from_name("John Doe"); + /// assert_eq!(person.name.as_deref(), Some("John Doe")); + /// assert!(person.email.is_none()); + /// assert!(person.uri.is_none()); + /// ``` + #[inline] + pub fn from_name(name: impl Into<String>) -> Self { + Self { + name: Some(name.into()), + email: None, + uri: None, + } + } +} + /// Tag/category #[derive(Debug, Clone)] pub struct Tag { diff --git a/crates/feedparser-rs-core/src/types/entry.rs b/crates/feedparser-rs-core/src/types/entry.rs index 0ebe581..fc59275 100644 --- a/crates/feedparser-rs-core/src/types/entry.rs +++ b/crates/feedparser-rs-core/src/types/entry.rs @@ -101,6 +101,74 @@ impl Entry { ..Default::default() } } + + /// Sets title field with `TextConstruct`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{Entry, TextConstruct}; + /// + /// let mut entry = Entry::default(); + /// entry.set_title(TextConstruct::text("Great Article")); + /// assert_eq!(entry.title.as_deref(), Some("Great Article")); + /// ``` + #[inline] + pub fn set_title(&mut self, mut text: TextConstruct) { + self.title = Some(std::mem::take(&mut text.value)); + self.title_detail = Some(text); + } + + /// Sets summary field with `TextConstruct`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{Entry, TextConstruct}; + /// + /// let mut entry = Entry::default(); + /// entry.set_summary(TextConstruct::text("A summary")); + /// assert_eq!(entry.summary.as_deref(), Some("A summary")); + /// ``` + #[inline] + pub fn set_summary(&mut self, mut text: TextConstruct) { + self.summary = Some(std::mem::take(&mut text.value)); + self.summary_detail = Some(text); + } + + /// Sets author field with `Person`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{Entry, Person}; + /// + /// let mut entry = Entry::default(); + /// entry.set_author(Person::from_name("Jane Doe")); + /// assert_eq!(entry.author.as_deref(), Some("Jane Doe")); + /// ``` + #[inline] + pub fn set_author(&mut self, mut person: Person) { + self.author = person.name.take(); + self.author_detail = Some(person); + } + + /// Sets publisher field with `Person`, storing both simple and detailed
versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{Entry, Person}; + /// + /// let mut entry = Entry::default(); + /// entry.set_publisher(Person::from_name("ACME Corp")); + /// assert_eq!(entry.publisher.as_deref(), Some("ACME Corp")); + /// ``` + #[inline] + pub fn set_publisher(&mut self, mut person: Person) { + self.publisher = person.name.take(); + self.publisher_detail = Some(person); + } } #[cfg(test)] diff --git a/crates/feedparser-rs-core/src/types/feed.rs b/crates/feedparser-rs-core/src/types/feed.rs index 2d8d833..afb8923 100644 --- a/crates/feedparser-rs-core/src/types/feed.rs +++ b/crates/feedparser-rs-core/src/types/feed.rs @@ -1,10 +1,13 @@ use super::{ common::{Generator, Image, Link, Person, Tag, TextConstruct}, entry::Entry, + generics::LimitedCollectionExt, podcast::{ItunesFeedMeta, PodcastMeta}, version::FeedVersion, }; +use crate::{ParserLimits, error::Result}; use chrono::{DateTime, Utc}; +use quick_xml::Reader; use std::collections::HashMap; /// Feed metadata @@ -139,6 +142,63 @@ impl ParsedFeed { ..Default::default() } } + + /// Check if entry limit is reached, set bozo flag and skip element if so + /// + /// This helper consolidates the duplicate entry limit checking logic used in + /// RSS and Atom parsers. If the entry limit is reached, it: + /// - Sets `bozo` flag to true + /// - Sets `bozo_exception` with descriptive error message + /// - Skips the entry element + /// - Returns `Ok(false)` to signal that the entry should not be processed + /// + /// # Arguments + /// + /// * `reader` - XML reader positioned at the entry element + /// * `buf` - Buffer for XML event reading + /// * `limits` - Parser limits including `max_entries` + /// * `depth` - Current nesting depth (will be decremented) + /// + /// # Returns + /// + /// * `Ok(true)` - Entry can be processed (limit not reached) + /// * `Ok(false)` - Entry limit reached, element was skipped + /// + /// # Errors + /// + /// Returns an error if: + /// - Skipping the entry element fails (e.g., malformed XML) + /// - Nesting depth exceeds limits while skipping + /// + /// # Examples + /// + /// ```ignore + /// // In parser: + /// if !feed.check_entry_limit(reader, &mut buf, limits, depth)? { + /// continue; + /// } + /// // Process entry... 
+ /// ``` + #[inline] + pub fn check_entry_limit( + &mut self, + reader: &mut Reader<&[u8]>, + buf: &mut Vec<u8>, + limits: &ParserLimits, + depth: &mut usize, + ) -> Result<bool> { + use crate::parser::skip_element; + + if self.entries.is_at_limit(limits.max_entries) { + self.bozo = true; + self.bozo_exception = Some(format!("Entry limit exceeded: {}", limits.max_entries)); + skip_element(reader, buf, limits, *depth)?; + *depth = depth.saturating_sub(1); + Ok(false) + } else { + Ok(true) + } + } } impl FeedMeta { @@ -192,6 +252,115 @@ impl FeedMeta { ..Default::default() } } + + /// Sets title field with `TextConstruct`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{FeedMeta, TextConstruct}; + /// + /// let mut meta = FeedMeta::default(); + /// meta.set_title(TextConstruct::text("Example Feed")); + /// assert_eq!(meta.title.as_deref(), Some("Example Feed")); + /// ``` + #[inline] + pub fn set_title(&mut self, mut text: TextConstruct) { + self.title = Some(std::mem::take(&mut text.value)); + self.title_detail = Some(text); + } + + /// Sets subtitle field with `TextConstruct`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{FeedMeta, TextConstruct}; + /// + /// let mut meta = FeedMeta::default(); + /// meta.set_subtitle(TextConstruct::text("A great feed")); + /// assert_eq!(meta.subtitle.as_deref(), Some("A great feed")); + /// ``` + #[inline] + pub fn set_subtitle(&mut self, mut text: TextConstruct) { + self.subtitle = Some(std::mem::take(&mut text.value)); + self.subtitle_detail = Some(text); + } + + /// Sets rights field with `TextConstruct`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{FeedMeta, TextConstruct}; + /// + /// let mut meta = FeedMeta::default(); + /// meta.set_rights(TextConstruct::text("© 2025 Example")); + /// assert_eq!(meta.rights.as_deref(), Some("© 2025 Example")); + /// ``` + #[inline] + pub fn set_rights(&mut self, mut text: TextConstruct) { + self.rights = Some(std::mem::take(&mut text.value)); + self.rights_detail = Some(text); + } + + /// Sets generator field with `Generator`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{FeedMeta, Generator}; + /// + /// # fn main() { + /// let mut meta = FeedMeta::default(); + /// let generator = Generator { + /// value: "Example Generator".to_string(), + /// uri: None, + /// version: None, + /// }; + /// meta.set_generator(generator); + /// assert_eq!(meta.generator.as_deref(), Some("Example Generator")); + /// # } + /// ``` + #[inline] + pub fn set_generator(&mut self, mut generator: Generator) { + self.generator = Some(std::mem::take(&mut generator.value)); + self.generator_detail = Some(generator); + } + + /// Sets author field with `Person`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{FeedMeta, Person}; + /// + /// let mut meta = FeedMeta::default(); + /// meta.set_author(Person::from_name("John Doe")); + /// assert_eq!(meta.author.as_deref(), Some("John Doe")); + /// ``` + #[inline] + pub fn set_author(&mut self, mut person: Person) { + self.author = person.name.take(); + self.author_detail = Some(person); + } + + /// Sets publisher field with `Person`, storing both simple and detailed versions + /// + /// # Examples + /// + /// ``` + /// use feedparser_rs_core::{FeedMeta, Person}; + /// + /// let mut meta
= FeedMeta::default(); + /// meta.set_publisher(Person::from_name("ACME Corp")); + /// assert_eq!(meta.publisher.as_deref(), Some("ACME Corp")); + /// ``` + #[inline] + pub fn set_publisher(&mut self, mut person: Person) { + self.publisher = person.name.take(); + self.publisher_detail = Some(person); + } } #[cfg(test)] diff --git a/crates/feedparser-rs-core/src/util/text.rs b/crates/feedparser-rs-core/src/util/text.rs index f026144..1563608 100644 --- a/crates/feedparser-rs-core/src/util/text.rs +++ b/crates/feedparser-rs-core/src/util/text.rs @@ -1,6 +1,29 @@ -// Text processing utilities -// -// This module will provide functions for text manipulation, -// such as trimming, normalizing whitespace, etc. +//! Text processing utilities +//! +//! This module provides functions for text manipulation, +//! such as trimming, normalizing whitespace, and encoding conversion. -// TODO: Implement as needed +/// Efficient bytes to string conversion - zero-copy for valid UTF-8 +/// +/// Uses `std::str::from_utf8()` for zero-copy conversion when the input +/// is valid UTF-8, falling back to lossy conversion otherwise. +/// +/// # Examples +/// +/// ``` +/// use feedparser_rs_core::util::text::bytes_to_string; +/// +/// let valid_utf8 = b"Hello, world!"; +/// assert_eq!(bytes_to_string(valid_utf8), "Hello, world!"); +/// +/// let invalid_utf8 = &[0xFF, 0xFE, 0xFD]; +/// let result = bytes_to_string(invalid_utf8); +/// assert!(!result.is_empty()); // Lossy conversion succeeded +/// ``` +#[inline] +pub fn bytes_to_string(value: &[u8]) -> String { + std::str::from_utf8(value).map_or_else( + |_| String::from_utf8_lossy(value).into_owned(), + std::string::ToString::to_string, + ) +}
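
Taken together, the new setters and `Person::from_name` replace the repeated `field = Some(...); field_detail = Some(...)` pattern with a single call per field. The snippet below is a minimal usage sketch assembled from the doctests added in this diff; it assumes the crate-root re-exports (`Entry`, `FeedMeta`, `Person`, `TextConstruct`) and the `feedparser_rs_core::util::text::bytes_to_string` path exactly as those doctests show them, and is illustrative rather than part of the change set.

```rust
use feedparser_rs_core::util::text::bytes_to_string;
use feedparser_rs_core::{Entry, FeedMeta, Person, TextConstruct};

fn main() {
    // Feed-level metadata: each setter fills both the simple field
    // (`title`, `author`) and its `_detail` counterpart in one call.
    let mut meta = FeedMeta::default();
    meta.set_title(TextConstruct::text("Example Feed"));
    meta.set_author(Person::from_name("John Doe"));
    assert_eq!(meta.title.as_deref(), Some("Example Feed"));
    assert_eq!(meta.author.as_deref(), Some("John Doe"));
    assert!(meta.title_detail.is_some());

    // Entry-level setters mirror the feed-level ones.
    let mut entry = Entry::default();
    entry.set_summary(TextConstruct::text("A summary"));
    assert_eq!(entry.summary.as_deref(), Some("A summary"));

    // The relocated byte-to-string helper is zero-copy for valid UTF-8
    // and falls back to lossy conversion otherwise.
    assert_eq!(bytes_to_string(b"Hello, world!"), "Hello, world!");
}
```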