diff --git a/crates/feedparser-rs-core/src/lib.rs b/crates/feedparser-rs-core/src/lib.rs
index 6fbb8b3..a1bc006 100644
--- a/crates/feedparser-rs-core/src/lib.rs
+++ b/crates/feedparser-rs-core/src/lib.rs
@@ -72,6 +72,8 @@ pub use types::{
     TextType, parse_duration, parse_explicit,
 };
 
+pub use namespace::syndication::{SyndicationMeta, UpdatePeriod};
+
 #[cfg(feature = "http")]
 pub use http::{FeedHttpClient, FeedHttpResponse};
diff --git a/crates/feedparser-rs-core/src/namespace/mod.rs b/crates/feedparser-rs-core/src/namespace/mod.rs
index 7056e7d..668bb53 100644
--- a/crates/feedparser-rs-core/src/namespace/mod.rs
+++ b/crates/feedparser-rs-core/src/namespace/mod.rs
@@ -35,6 +35,8 @@ pub mod dublin_core;
 pub mod georss;
 /// Media RSS specification
 pub mod media_rss;
+/// Syndication Module for RSS 1.0
+pub mod syndication;
 
 /// Common namespace URIs used in feeds
 pub mod namespaces {
@@ -56,6 +58,9 @@ pub mod namespaces {
     /// RSS 1.0
     pub const RSS_10: &str = "http://purl.org/rss/1.0/";
 
+    /// Syndication Module for RSS 1.0
+    pub const SYNDICATION: &str = "http://purl.org/rss/1.0/modules/syndication/";
+
     /// iTunes Podcast
     pub const ITUNES: &str = "http://www.itunes.com/dtds/podcast-1.0.dtd";
 
@@ -88,6 +93,7 @@ pub fn get_namespace_uri(prefix: &str) -> Option<&'static str> {
         "media" => Some(namespaces::MEDIA),
         "atom" => Some(namespaces::ATOM),
         "rdf" => Some(namespaces::RDF),
+        "syn" | "syndication" => Some(namespaces::SYNDICATION),
         "itunes" => Some(namespaces::ITUNES),
         "podcast" => Some(namespaces::PODCAST),
         "georss" => Some(namespaces::GEORSS),
@@ -113,6 +119,7 @@ pub fn get_namespace_prefix(uri: &str) -> Option<&'static str> {
         namespaces::MEDIA => Some("media"),
         namespaces::ATOM => Some("atom"),
         namespaces::RDF => Some("rdf"),
+        namespaces::SYNDICATION => Some("syn"),
         namespaces::ITUNES => Some("itunes"),
         namespaces::PODCAST => Some("podcast"),
         namespaces::GEORSS => Some("georss"),
diff --git a/crates/feedparser-rs-core/src/namespace/syndication.rs b/crates/feedparser-rs-core/src/namespace/syndication.rs
new file mode 100644
index 0000000..f98105e
--- /dev/null
+++ b/crates/feedparser-rs-core/src/namespace/syndication.rs
@@ -0,0 +1,211 @@
+/// Syndication Module for RSS 1.0
+///
+/// Namespace: <http://purl.org/rss/1.0/modules/syndication/>
+/// Prefix: syn
+///
+/// This module provides parsing support for the Syndication namespace,
+/// used in RSS 1.0 feeds to indicate update schedules and frequencies.
+///
+/// Elements:
+/// - `syn:updatePeriod` → Update period (hourly, daily, weekly, monthly, yearly)
+/// - `syn:updateFrequency` → Number of times per period
+/// - `syn:updateBase` → Base date for update schedule (ISO 8601)
+use crate::types::FeedMeta;
+
+/// Syndication namespace URI
+pub const SYNDICATION_NAMESPACE: &str = "http://purl.org/rss/1.0/modules/syndication/";
+
+/// Valid update period values
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum UpdatePeriod {
+    /// Update hourly
+    Hourly,
+    /// Update daily
+    Daily,
+    /// Update weekly
+    Weekly,
+    /// Update monthly
+    Monthly,
+    /// Update yearly
+    Yearly,
+}
+
+impl UpdatePeriod {
+    /// Parse update period from string (case-insensitive)
+    ///
+    /// Returns `None` if the string doesn't match any valid period.
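+    ///
+    /// # Examples
+    ///
+    /// Illustrative sketch of the accepted values (marked `ignore`, matching
+    /// this crate's convention for non-doctest examples):
+    ///
+    /// ```ignore
+    /// assert_eq!(UpdatePeriod::parse("Daily"), Some(UpdatePeriod::Daily));
+    /// assert_eq!(UpdatePeriod::parse("HOURLY"), Some(UpdatePeriod::Hourly));
+    /// assert_eq!(UpdatePeriod::parse("fortnightly"), None);
+    /// ```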
+    #[must_use]
+    pub fn parse(s: &str) -> Option<Self> {
+        match s.to_lowercase().as_str() {
+            "hourly" => Some(Self::Hourly),
+            "daily" => Some(Self::Daily),
+            "weekly" => Some(Self::Weekly),
+            "monthly" => Some(Self::Monthly),
+            "yearly" => Some(Self::Yearly),
+            _ => None,
+        }
+    }
+
+    /// Convert to string representation
+    #[must_use]
+    pub const fn as_str(&self) -> &'static str {
+        match self {
+            Self::Hourly => "hourly",
+            Self::Daily => "daily",
+            Self::Weekly => "weekly",
+            Self::Monthly => "monthly",
+            Self::Yearly => "yearly",
+        }
+    }
+}
+
+/// Syndication metadata
+#[derive(Debug, Clone, Default)]
+pub struct SyndicationMeta {
+    /// Update period (hourly, daily, weekly, monthly, yearly)
+    pub update_period: Option<UpdatePeriod>,
+    /// Number of times updated per period
+    pub update_frequency: Option<u32>,
+    /// Base date for update schedule (ISO 8601)
+    pub update_base: Option<String>,
+}
+
+/// Handle Syndication namespace element at feed level
+///
+/// # Arguments
+///
+/// * `element` - Local name of the element (without namespace prefix)
+/// * `text` - Text content of the element
+/// * `feed` - Feed metadata to update
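+///
+/// # Examples
+///
+/// Illustrative sketch (marked `ignore`, matching this crate's doc-example
+/// convention for internal helpers):
+///
+/// ```ignore
+/// let mut feed = FeedMeta::default();
+/// handle_feed_element("updatePeriod", "daily", &mut feed);
+/// handle_feed_element("updateFrequency", "2", &mut feed);
+/// // feed.syndication now records two updates per day
+/// ```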
+pub fn handle_feed_element(element: &str, text: &str, feed: &mut FeedMeta) {
+    match element {
+        "updatePeriod" => {
+            if let Some(period) = UpdatePeriod::parse(text) {
+                if feed.syndication.is_none() {
+                    feed.syndication = Some(SyndicationMeta::default());
+                }
+                if let Some(syn) = &mut feed.syndication {
+                    syn.update_period = Some(period);
+                }
+            }
+        }
+        "updateFrequency" => {
+            if let Ok(freq) = text.parse::<u32>() {
+                if feed.syndication.is_none() {
+                    feed.syndication = Some(SyndicationMeta::default());
+                }
+                if let Some(syn) = &mut feed.syndication {
+                    syn.update_frequency = Some(freq);
+                }
+            }
+        }
+        "updateBase" => {
+            if feed.syndication.is_none() {
+                feed.syndication = Some(SyndicationMeta::default());
+            }
+            if let Some(syn) = &mut feed.syndication {
+                syn.update_base = Some(text.to_string());
+            }
+        }
+        _ => {
+            // Ignore unknown syndication elements
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_update_period_parse() {
+        assert_eq!(UpdatePeriod::parse("hourly"), Some(UpdatePeriod::Hourly));
+        assert_eq!(UpdatePeriod::parse("daily"), Some(UpdatePeriod::Daily));
+        assert_eq!(UpdatePeriod::parse("weekly"), Some(UpdatePeriod::Weekly));
+        assert_eq!(UpdatePeriod::parse("monthly"), Some(UpdatePeriod::Monthly));
+        assert_eq!(UpdatePeriod::parse("yearly"), Some(UpdatePeriod::Yearly));
+        assert_eq!(UpdatePeriod::parse("invalid"), None);
+    }
+
+    #[test]
+    fn test_update_period_case_insensitive() {
+        assert_eq!(UpdatePeriod::parse("HOURLY"), Some(UpdatePeriod::Hourly));
+        assert_eq!(UpdatePeriod::parse("Daily"), Some(UpdatePeriod::Daily));
+        assert_eq!(UpdatePeriod::parse("WeeKLY"), Some(UpdatePeriod::Weekly));
+    }
+
+    #[test]
+    fn test_update_period_as_str() {
+        assert_eq!(UpdatePeriod::Hourly.as_str(), "hourly");
+        assert_eq!(UpdatePeriod::Daily.as_str(), "daily");
+        assert_eq!(UpdatePeriod::Weekly.as_str(), "weekly");
+        assert_eq!(UpdatePeriod::Monthly.as_str(), "monthly");
+        assert_eq!(UpdatePeriod::Yearly.as_str(), "yearly");
+    }
+
+    #[test]
+    fn test_handle_update_period() {
+        let mut feed = FeedMeta::default();
+
+        handle_feed_element("updatePeriod", "daily", &mut feed);
+
+        assert!(feed.syndication.is_some());
+        let syn = feed.syndication.as_ref().unwrap();
+        assert_eq!(syn.update_period, Some(UpdatePeriod::Daily));
+    }
+
+    #[test]
+    fn test_handle_update_frequency() {
+        let mut feed = FeedMeta::default();
+
+        handle_feed_element("updateFrequency", "2", &mut feed);
+
+        assert!(feed.syndication.is_some());
+        let syn = feed.syndication.as_ref().unwrap();
+        assert_eq!(syn.update_frequency, Some(2));
+    }
+
+    #[test]
+    fn test_handle_update_base() {
+        let mut feed = FeedMeta::default();
+
+        handle_feed_element("updateBase", "2024-12-18T00:00:00Z", &mut feed);
+
+        assert!(feed.syndication.is_some());
+        let syn = feed.syndication.as_ref().unwrap();
+        assert_eq!(syn.update_base.as_deref(), Some("2024-12-18T00:00:00Z"));
+    }
+
+    #[test]
+    fn test_handle_multiple_elements() {
+        let mut feed = FeedMeta::default();
+
+        handle_feed_element("updatePeriod", "hourly", &mut feed);
+        handle_feed_element("updateFrequency", "1", &mut feed);
+        handle_feed_element("updateBase", "2024-01-01T00:00:00Z", &mut feed);
+
+        let syn = feed.syndication.as_ref().unwrap();
+        assert_eq!(syn.update_period, Some(UpdatePeriod::Hourly));
+        assert_eq!(syn.update_frequency, Some(1));
+        assert_eq!(syn.update_base.as_deref(), Some("2024-01-01T00:00:00Z"));
+    }
+
+    #[test]
+    fn test_handle_invalid_frequency() {
+        let mut feed = FeedMeta::default();
+
+        handle_feed_element("updateFrequency", "not-a-number", &mut feed);
+
+        // Should not create syndication metadata for invalid input
+        assert!(feed.syndication.is_none());
+    }
+
+    #[test]
+    fn test_handle_unknown_element() {
+        let mut feed = FeedMeta::default();
+
+        handle_feed_element("unknown", "value", &mut feed);
+
+        assert!(feed.syndication.is_none());
+    }
+}
diff --git a/crates/feedparser-rs-core/src/parser/common.rs b/crates/feedparser-rs-core/src/parser/common.rs
index 569c24b..55074d7 100644
--- a/crates/feedparser-rs-core/src/parser/common.rs
+++ b/crates/feedparser-rs-core/src/parser/common.rs
@@ -209,6 +209,20 @@ pub fn is_content_tag(name: &[u8]) -> Option<&str> {
     extract_ns_local_name(name, b"content:")
 }
 
+/// Check if element is a Syndication namespaced tag
+///
+/// # Examples
+///
+/// ```ignore
+/// assert_eq!(is_syn_tag(b"syn:updatePeriod"), Some("updatePeriod"));
+/// assert_eq!(is_syn_tag(b"syn:updateFrequency"), Some("updateFrequency"));
+/// assert_eq!(is_syn_tag(b"dc:creator"), None);
+/// ```
+#[inline]
+pub fn is_syn_tag(name: &[u8]) -> Option<&str> {
+    extract_ns_local_name(name, b"syn:")
+}
+
 /// Check if element is a Media RSS namespaced tag
 ///
 /// # Examples
 ///
diff --git a/crates/feedparser-rs-core/src/parser/rss10.rs b/crates/feedparser-rs-core/src/parser/rss10.rs
index bc9bd4d..98b9a3b 100644
--- a/crates/feedparser-rs-core/src/parser/rss10.rs
+++ b/crates/feedparser-rs-core/src/parser/rss10.rs
@@ -10,14 +10,14 @@
 use crate::{
     ParserLimits,
     error::{FeedError, Result},
-    namespace::dublin_core,
+    namespace::{content, dublin_core, syndication},
     types::{Entry, FeedVersion, Image, ParsedFeed, TextConstruct, TextType},
 };
 use quick_xml::{Reader, events::Event};
 
 use super::common::{
-    EVENT_BUFFER_CAPACITY, LimitedCollectionExt, check_depth, init_feed, is_dc_tag, read_text,
-    skip_element,
+    EVENT_BUFFER_CAPACITY, LimitedCollectionExt, check_depth, init_feed, is_content_tag, is_dc_tag,
+    is_syn_tag, read_text, skip_element,
 };
 
 /// Parse RSS 1.0 (RDF) feed from raw bytes
@@ -223,6 +223,10 @@ fn parse_channel(
                     let dc_elem = dc_element.to_string();
                     let text = read_text(reader, &mut buf, limits)?;
                     dublin_core::handle_feed_element(&dc_elem, &text, &mut feed.feed);
+                } else if let Some(syn_element) = is_syn_tag(full_name.as_ref()) {
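+                    // syn:* channel elements are plain text nodes (updatePeriod,
+                    // updateFrequency, updateBase); route them to the namespace
+                    // handler, mirroring the Dublin Core branch above.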
+                    let syn_elem = syn_element.to_string();
+                    let text = read_text(reader, &mut buf, limits)?;
+                    syndication::handle_feed_element(&syn_elem, &text, &mut feed.feed);
                 } else {
                     skip_element(reader, &mut buf, limits, *depth)?;
                 }
@@ -288,6 +292,10 @@ fn parse_item(
                     let text = read_text(reader, buf, limits)?;
                     // dublin_core::handle_entry_element already handles dc:date -> published
                     dublin_core::handle_entry_element(&dc_elem, &text, &mut entry);
+                } else if let Some(content_element) = is_content_tag(full_name.as_ref()) {
+                    let content_elem = content_element.to_string();
+                    let text = read_text(reader, buf, limits)?;
+                    content::handle_entry_element(&content_elem, &text, &mut entry);
                 } else {
                     skip_element(reader, buf, limits, *depth)?;
                 }
@@ -568,4 +576,64 @@ mod tests {
         assert!(is_dc_tag(b"link").is_none());
         assert!(is_dc_tag(b"atom:title").is_none());
     }
+
+    #[test]
+    fn test_parse_rss10_with_content_encoded() {
+        let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:content="http://purl.org/rss/1.0/modules/content/">
+    <channel rdf:about="http://example.com/">
+        <title>Test</title>
+        <link>http://example.com</link>
+        <description>Test</description>
+    </channel>
+    <item rdf:about="http://example.com/1">
+        <title>Item 1</title>
+        <link>http://example.com/1</link>
+        <description>Brief summary</description>
+        <content:encoded><![CDATA[<p>Full <strong>HTML</strong> content</p>
+        ]]></content:encoded>
+    </item>
+</rdf:RDF>"#;
+
+        let feed = parse_rss10(xml).unwrap();
+        assert_eq!(feed.entries.len(), 1);
+
+        let entry = &feed.entries[0];
+        assert_eq!(entry.summary.as_deref(), Some("Brief summary"));
+
+        // Verify content:encoded is parsed
+        assert!(!entry.content.is_empty());
+        assert_eq!(entry.content[0].content_type.as_deref(), Some("text/html"));
+        assert!(entry.content[0].value.contains("Full"));
+        assert!(entry.content[0].value.contains("HTML"));
+    }
+
+    #[test]
+    fn test_parse_rss10_with_syndication() {
+        let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+    <channel rdf:about="http://example.com/">
+        <title>Test</title>
+        <link>http://example.com</link>
+        <description>Test</description>
+        <syn:updatePeriod>hourly</syn:updatePeriod>
+        <syn:updateFrequency>2</syn:updateFrequency>
+        <syn:updateBase>2024-01-01T00:00:00Z</syn:updateBase>
+    </channel>
+</rdf:RDF>"#;
+
+        let feed = parse_rss10(xml).unwrap();
+        assert!(feed.feed.syndication.is_some());
+
+        let syn = feed.feed.syndication.as_ref().unwrap();
+        assert_eq!(
+            syn.update_period,
+            Some(crate::namespace::syndication::UpdatePeriod::Hourly)
+        );
+        assert_eq!(syn.update_frequency, Some(2));
+        assert_eq!(syn.update_base.as_deref(), Some("2024-01-01T00:00:00Z"));
+    }
 }
diff --git a/crates/feedparser-rs-core/src/types/feed.rs b/crates/feedparser-rs-core/src/types/feed.rs
index 247f61d..9419b6f 100644
--- a/crates/feedparser-rs-core/src/types/feed.rs
+++ b/crates/feedparser-rs-core/src/types/feed.rs
@@ -5,6 +5,7 @@ use super::{
     podcast::{ItunesFeedMeta, PodcastMeta},
     version::FeedVersion,
 };
+use crate::namespace::syndication::SyndicationMeta;
 use crate::{ParserLimits, error::Result};
 use chrono::{DateTime, Utc};
 use quick_xml::Reader;
@@ -75,6 +76,8 @@ pub struct FeedMeta {
     pub dc_rights: Option<String>,
     /// License URL (Creative Commons, etc.)
     pub license: Option<String>,
+    /// Syndication module metadata (RSS 1.0)
+    pub syndication: Option<SyndicationMeta>,
 }
 
 /// Parsed feed result
diff --git a/crates/feedparser-rs-core/tests/test_rss10.rs b/crates/feedparser-rs-core/tests/test_rss10.rs
new file mode 100644
index 0000000..f77dce3
--- /dev/null
+++ b/crates/feedparser-rs-core/tests/test_rss10.rs
@@ -0,0 +1,520 @@
+//! Integration tests for RSS 1.0 (RDF) parser
+//!
+//! Tests comprehensive RSS 1.0 feed parsing including:
+//! - Basic channel and item elements
+//! - Dublin Core namespace support
+//! - Content namespace support
+//! - RDF structure handling
+//! - Malformed feed tolerance (bozo pattern)
+
+use chrono::{Datelike, Timelike};
+use feedparser_rs::{FeedVersion, ParserLimits, namespace::syndication::UpdatePeriod, parse};
+use std::fmt::Write as _;
+
+#[test]
+fn test_basic_rss10_feed() {
+    let xml = br#"<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Example RSS 1.0 Feed</title>
+        <link>http://example.com</link>
+        <description>This is an example RSS 1.0 feed</description>
+    </channel>
+    <item rdf:about="http://example.com/article1">
+        <title>First Article</title>
+        <link>http://example.com/article1</link>
+        <description>Summary of first article</description>
+    </item>
+    <item rdf:about="http://example.com/article2">
+        <title>Second Article</title>
+        <link>http://example.com/article2</link>
+        <description>Summary of second article</description>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse RSS 1.0 feed");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo, "Feed should not be marked as bozo");
+
+    // Check feed metadata
+    assert_eq!(feed.feed.title.as_deref(), Some("Example RSS 1.0 Feed"));
+    assert_eq!(feed.feed.link.as_deref(), Some("http://example.com"));
+    assert_eq!(
+        feed.feed.subtitle.as_deref(),
+        Some("This is an example RSS 1.0 feed")
+    );
+    assert_eq!(feed.feed.id.as_deref(), Some("http://example.com/"));
+
+    // Check entries
+    assert_eq!(feed.entries.len(), 2);
+
+    let first = &feed.entries[0];
+    assert_eq!(first.title.as_deref(), Some("First Article"));
+    assert_eq!(first.link.as_deref(), Some("http://example.com/article1"));
+    assert_eq!(first.id.as_deref(), Some("http://example.com/article1"));
+    assert_eq!(first.summary.as_deref(), Some("Summary of first article"));
+
+    let second = &feed.entries[1];
+    assert_eq!(second.title.as_deref(), Some("Second Article"));
+    assert_eq!(second.link.as_deref(), Some("http://example.com/article2"));
+}
+
+#[test]
+fn test_rss10_with_dublin_core() {
+    let xml = br#"<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <channel rdf:about="http://example.com/">
+        <title>News Feed</title>
+        <link>http://example.com</link>
+        <description>Daily news</description>
+        <dc:creator>Editorial Team</dc:creator>
+        <dc:rights>Copyright 2024 Example Corp</dc:rights>
+        <dc:date>2024-12-18T10:00:00Z</dc:date>
+        <dc:language>en-US</dc:language>
+    </channel>
+    <item rdf:about="http://example.com/news1">
+        <title>Breaking News</title>
+        <link>http://example.com/news1</link>
+        <description>Important announcement</description>
+        <dc:creator>John Doe</dc:creator>
+        <dc:date>2024-12-18T09:30:00Z</dc:date>
+        <dc:subject>politics</dc:subject>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse RSS 1.0 with Dublin Core");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo);
+
+    // Check Dublin Core elements at feed level
+    assert_eq!(feed.feed.dc_creator.as_deref(), Some("Editorial Team"));
+    assert_eq!(
+        feed.feed.dc_rights.as_deref(),
+        Some("Copyright 2024 Example Corp")
+    );
+    // dc:language is mapped to feed.language
+    assert_eq!(feed.feed.language.as_deref(), Some("en-US"));
+
+    // Check Dublin Core elements at entry level
+    assert_eq!(feed.entries.len(), 1);
+    let entry = &feed.entries[0];
+    assert_eq!(entry.author.as_deref(), Some("John Doe"));
+    assert!(
+        entry.published.is_some(),
+        "dc:date should be parsed as published date"
+    );
+
+    if let Some(published) = entry.published {
+        assert_eq!(published.year(), 2024);
+        assert_eq!(published.month(), 12);
+        assert_eq!(published.day(), 18);
+        assert_eq!(published.hour(), 9);
+        assert_eq!(published.minute(), 30);
+    }
+}
+
+#[test]
+fn test_rss10_with_content_encoded() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:content="http://purl.org/rss/1.0/modules/content/">
+    <channel rdf:about="http://example.com/">
+        <title>Blog</title>
+        <link>http://example.com</link>
+        <description>Tech blog</description>
+    </channel>
+    <item rdf:about="http://example.com/post1">
+        <title>Using RSS 1.0</title>
+        <link>http://example.com/post1</link>
+        <description>Brief summary</description>
+        <content:encoded><![CDATA[
+            <p>This is the full HTML content of the post.</p>
+
+            <p>It includes <strong>formatting</strong> and multiple paragraphs.</p>
+        ]]></content:encoded>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse RSS 1.0 with content:encoded");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo);
+
+    assert_eq!(feed.entries.len(), 1);
+    let entry = &feed.entries[0];
+
+    // Check that summary is populated from description
+    assert_eq!(entry.summary.as_deref(), Some("Brief summary"));
+
+    // Check that content:encoded is parsed
+    assert!(
+        !entry.content.is_empty(),
+        "content:encoded should be parsed"
+    );
+    assert_eq!(entry.content[0].content_type.as_deref(), Some("text/html"));
+    assert!(entry.content[0].value.contains("full HTML content"));
+}
+
+#[test]
+fn test_rss10_with_image() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Example Feed</title>
+        <link>http://example.com</link>
+        <description>Example</description>
+    </channel>
+    <image rdf:about="http://example.com/logo.png">
+        <url>http://example.com/logo.png</url>
+        <title>Example Logo</title>
+        <link>http://example.com</link>
+    </image>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse RSS 1.0 with image");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo);
+
+    assert!(feed.feed.image.is_some(), "Image should be parsed");
+    let image = feed.feed.image.as_ref().unwrap();
+    assert_eq!(image.url, "http://example.com/logo.png");
+    assert_eq!(image.title.as_deref(), Some("Example Logo"));
+    assert_eq!(image.link.as_deref(), Some("http://example.com"));
+}
+
+#[test]
+fn test_rss10_empty_items() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Empty Feed</title>
+        <link>http://example.com</link>
+        <description>Feed with no items</description>
+    </channel>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse RSS 1.0 with empty items");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo);
+    assert_eq!(feed.entries.len(), 0);
+    assert_eq!(feed.feed.title.as_deref(), Some("Empty Feed"));
+}
+
+#[test]
+fn test_rss10_missing_required_fields() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel>
+        <title>Incomplete Feed</title>
+        <!-- missing link and description -->
+    </channel>
+    <item>
+        <!-- missing title and link -->
+        <description>Only has description</description>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Parser should be tolerant of missing fields");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    // Should still extract what's available
+    assert_eq!(feed.feed.title.as_deref(), Some("Incomplete Feed"));
+    assert_eq!(feed.entries.len(), 1);
+    assert_eq!(
+        feed.entries[0].summary.as_deref(),
+        Some("Only has description")
+    );
+}
+
+#[test]
+fn test_rss10_malformed_xml_bozo() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Test Feed</title>
+        <link>http://example.com</link>
+        <description>Test</description>
+    </channel>
+    <item rdf:about="http://example.com/1">
+        <title>Unclosed title
+        <link>http://example.com/1</link>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Should parse despite malformed XML");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    // Bozo pattern: continue parsing but set flag
+    // Note: quick-xml in tolerant mode may or may not set bozo depending on how it handles this
+    // At minimum, feed metadata should be extracted
+    assert_eq!(feed.feed.title.as_deref(), Some("Test Feed"));
+}
+
+#[test]
+fn test_rss10_entry_limit() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Limited Feed</title>
+        <link>http://example.com</link>
+        <description>Test entry limits</description>
+    </channel>
+    <item rdf:about="http://example.com/1">
+        <title>Item 1</title>
+        <link>http://example.com/1</link>
+    </item>
+    <item rdf:about="http://example.com/2">
+        <title>Item 2</title>
+        <link>http://example.com/2</link>
+    </item>
+    <item rdf:about="http://example.com/3">
+        <title>Item 3</title>
+        <link>http://example.com/3</link>
+    </item>
+    <item rdf:about="http://example.com/4">
+        <title>Item 4</title>
+        <link>http://example.com/4</link>
+    </item>
+</rdf:RDF>"#;
+
+    let limits = ParserLimits {
+        max_entries: 2,
+        ..Default::default()
+    };
+
+    let feed =
+        feedparser_rs::parse_with_limits(xml, limits).expect("Failed to parse with entry limit");
+
+    assert_eq!(feed.entries.len(), 2);
+    assert!(feed.bozo, "Should set bozo flag when limit exceeded");
+    assert!(
+        feed.bozo_exception
+            .as_ref()
+            .unwrap()
+            .contains("Entry limit exceeded")
+    );
+}
+
+#[test]
+fn test_rss10_without_rdf_prefix() {
+    let xml = br#"<?xml version="1.0"?>
+<RDF xmlns="http://purl.org/rss/1.0/">
+    <channel>
+        <title>No Prefix Feed</title>
+        <link>http://example.com</link>
+        <description>RSS 1.0 without rdf: prefix</description>
+    </channel>
+    <item>
+        <title>Item Title</title>
+        <link>http://example.com/1</link>
+    </item>
+</RDF>"#;
+
+    let feed = parse(xml).expect("Should parse RDF without rdf: prefix");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert_eq!(feed.feed.title.as_deref(), Some("No Prefix Feed"));
+    assert_eq!(feed.entries.len(), 1);
+}
+
+#[test]
+fn test_rss10_nesting_depth_limit() {
+    let mut xml = String::from(
+        r#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Deep Nesting</title>
+        <link>http://example.com</link>
+        <description>Test nesting limits</description>"#,
+    );
+
+    // Create deeply nested structure (exceed default max_nesting_depth)
+    for i in 0..150 {
+        write!(&mut xml, "<nest{i}>").unwrap();
+    }
+    for i in (0..150).rev() {
+        write!(&mut xml, "</nest{i}>").unwrap();
+    }
+
+    xml.push_str(
+        r"
+    </channel>
+</rdf:RDF>",
+    );
+
+    let feed = parse(xml.as_bytes()).expect("Should handle deep nesting");
+
+    // Should set bozo flag when depth limit exceeded
+    assert!(
+        feed.bozo,
+        "Should set bozo flag for excessive nesting depth"
+    );
+    assert!(
+        feed.bozo_exception
+            .as_ref()
+            .is_some_and(|e| e.contains("nesting depth") || e.contains("exceeds maximum"))
+    );
+}
+
+#[test]
+fn test_rss10_real_world_slashdot_like() {
+    let xml = br#"<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <channel rdf:about="http://slashdot.org/">
+        <title>Slashdot</title>
+        <link>http://slashdot.org/</link>
+        <description>News for nerds, stuff that matters</description>
+        <dc:language>en-us</dc:language>
+        <dc:rights>Copyright 1997-2024, OSDN</dc:rights>
+        <dc:date>2024-12-18T10:00:00+00:00</dc:date>
+        <dc:publisher>OSDN</dc:publisher>
+        <items>
+            <rdf:Seq>
+                <rdf:li rdf:resource="http://slashdot.org/story/1"/>
+                <rdf:li rdf:resource="http://slashdot.org/story/2"/>
+            </rdf:Seq>
+        </items>
+    </channel>
+    <item rdf:about="http://slashdot.org/story/1">
+        <title>New Technology Breakthrough</title>
+        <link>http://slashdot.org/story/1</link>
+        <description>Scientists discover amazing things</description>
+        <dc:creator>BeauHD</dc:creator>
+        <dc:date>2024-12-18T08:30:00+00:00</dc:date>
+        <dc:subject>science</dc:subject>
+    </item>
+    <item rdf:about="http://slashdot.org/story/2">
+        <title>Open Source Project Released</title>
+        <link>http://slashdot.org/story/2</link>
+        <description>New version available for download</description>
+        <dc:creator>msmash</dc:creator>
+        <dc:date>2024-12-18T07:15:00+00:00</dc:date>
+        <dc:subject>opensource</dc:subject>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse Slashdot-like RSS 1.0");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo);
+
+    // Feed metadata
+    assert_eq!(feed.feed.title.as_deref(), Some("Slashdot"));
+    assert_eq!(feed.feed.link.as_deref(), Some("http://slashdot.org/"));
+    assert_eq!(
+        feed.feed.subtitle.as_deref(),
+        Some("News for nerds, stuff that matters")
+    );
+    // dc:language is mapped to feed.language
+    assert_eq!(feed.feed.language.as_deref(), Some("en-us"));
+    assert_eq!(
+        feed.feed.dc_rights.as_deref(),
+        Some("Copyright 1997-2024, OSDN")
+    );
+
+    // Entries
+    assert_eq!(feed.entries.len(), 2);
+
+    let first = &feed.entries[0];
+    assert_eq!(first.title.as_deref(), Some("New Technology Breakthrough"));
+    assert_eq!(first.author.as_deref(), Some("BeauHD"));
+    assert!(first.published.is_some());
+
+    let second = &feed.entries[1];
+    assert_eq!(
+        second.title.as_deref(),
+        Some("Open Source Project Released")
+    );
+    assert_eq!(second.author.as_deref(), Some("msmash"));
+}
+
+#[test]
+fn test_rss10_version_string() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/">
+    <channel rdf:about="http://example.com/">
+        <title>Test</title>
+        <link>http://example.com</link>
+        <description>Test</description>
+    </channel>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse");
+
+    // Verify version string matches Python feedparser convention
+    assert_eq!(feed.version.as_str(), "rss10");
+    assert_eq!(format!("{}", feed.version), "rss10");
+}
+
+#[test]
+fn test_rss10_with_syndication_module() {
+    let xml = br#"<?xml version="1.0"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+    <channel rdf:about="http://example.com/">
+        <title>Auto-Updated Feed</title>
+        <link>http://example.com</link>
+        <description>This feed updates twice per hour</description>
+        <syn:updatePeriod>hourly</syn:updatePeriod>
+        <syn:updateFrequency>2</syn:updateFrequency>
+        <syn:updateBase>2024-01-01T00:00:00Z</syn:updateBase>
+    </channel>
+    <item rdf:about="http://example.com/1">
+        <title>Test Item</title>
+        <link>http://example.com/1</link>
+        <description>Test description</description>
+    </item>
+</rdf:RDF>"#;
+
+    let feed = parse(xml).expect("Failed to parse RSS 1.0 with syndication");
+
+    assert_eq!(feed.version, FeedVersion::Rss10);
+    assert!(!feed.bozo);
+
+    // Verify syndication metadata
+    assert!(
+        feed.feed.syndication.is_some(),
+        "Syndication metadata should be present"
+    );
+
+    let syn = feed.feed.syndication.as_ref().unwrap();
+
+    // Check update period (hourly)
+    assert_eq!(
+        syn.update_period,
+        Some(UpdatePeriod::Hourly),
+        "Update period should be hourly"
+    );
+
+    // Check update frequency (2 times per period)
+    assert_eq!(
+        syn.update_frequency,
+        Some(2),
+        "Update frequency should be 2"
+    );
+
+    // Check update base timestamp
+    assert_eq!(
+        syn.update_base.as_deref(),
+        Some("2024-01-01T00:00:00Z"),
+        "Update base should be preserved"
+    );
+}
diff --git a/crates/feedparser-rs-node/__test__/syndication.spec.mjs b/crates/feedparser-rs-node/__test__/syndication.spec.mjs
new file mode 100644
index 0000000..3661da7
--- /dev/null
+++ b/crates/feedparser-rs-node/__test__/syndication.spec.mjs
@@ -0,0 +1,147 @@
+import { describe, it } from 'node:test';
+import assert from 'node:assert';
+import { parse } from '../index.js';
+
+describe('syndication', () => {
+  it('should parse syndication updatePeriod', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+        <channel rdf:about="https://example.com/">
+          <title>Test Feed</title>
+          <link>https://example.com</link>
+          <syn:updatePeriod>daily</syn:updatePeriod>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    assert.ok(feed.feed.syndication);
+    assert.strictEqual(feed.feed.syndication.updatePeriod, 'daily');
+  });
+
+  it('should parse syndication updateFrequency', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+        <channel rdf:about="https://example.com/">
+          <title>Test Feed</title>
+          <link>https://example.com</link>
+          <syn:updateFrequency>2</syn:updateFrequency>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    assert.ok(feed.feed.syndication);
+    assert.strictEqual(feed.feed.syndication.updateFrequency, 2);
+  });
+
+  it('should parse complete syndication metadata', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+        <channel rdf:about="https://example.com/">
+          <title>Test Feed</title>
+          <link>https://example.com</link>
+          <syn:updatePeriod>hourly</syn:updatePeriod>
+          <syn:updateFrequency>1</syn:updateFrequency>
+          <syn:updateBase>2024-01-01T00:00:00Z</syn:updateBase>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    const syn = feed.feed.syndication;
+    assert.ok(syn);
+    assert.strictEqual(syn.updatePeriod, 'hourly');
+    assert.strictEqual(syn.updateFrequency, 1);
+    assert.strictEqual(syn.updateBase, '2024-01-01T00:00:00Z');
+  });
+
+  it('should return undefined when syndication data is missing', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/">
+        <channel rdf:about="https://example.com/">
+          <title>Test Feed</title>
+          <link>https://example.com</link>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    assert.strictEqual(feed.feed.syndication, undefined);
+  });
+
+  it('should parse Dublin Core fields', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:dc="http://purl.org/dc/elements/1.1/">
+        <channel rdf:about="https://example.com/">
+          <title>Test Feed</title>
+          <link>https://example.com</link>
+          <dc:creator>John Doe</dc:creator>
+          <dc:publisher>ACME Corp</dc:publisher>
+          <dc:rights>Copyright 2024</dc:rights>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    assert.strictEqual(feed.feed.dcCreator, 'John Doe');
+    assert.strictEqual(feed.feed.dcPublisher, 'ACME Corp');
+    assert.strictEqual(feed.feed.dcRights, 'Copyright 2024');
+  });
+
+  it('should handle invalid updatePeriod gracefully (bozo pattern)', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+        <channel rdf:about="https://example.com/">
+          <title>Test</title>
+          <link>https://example.com</link>
+          <syn:updatePeriod>invalid</syn:updatePeriod>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    // Should not crash, syndication should be undefined or updatePeriod undefined
+    assert.ok(!feed.feed.syndication || !feed.feed.syndication.updatePeriod);
+  });
+
+  it('should handle case-insensitive updatePeriod', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+        <channel rdf:about="https://example.com/">
+          <title>Test</title>
+          <link>https://example.com</link>
+          <syn:updatePeriod>HOURLY</syn:updatePeriod>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    assert.ok(feed.feed.syndication);
+    assert.strictEqual(feed.feed.syndication.updatePeriod, 'hourly');
+  });
+
+  it('should parse feed with partial syndication fields', () => {
+    const xml = `<?xml version="1.0"?>
+      <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+               xmlns="http://purl.org/rss/1.0/"
+               xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+        <channel rdf:about="https://example.com/">
+          <title>Test</title>
+          <link>https://example.com</link>
+          <syn:updatePeriod>weekly</syn:updatePeriod>
+        </channel>
+      </rdf:RDF>`;
+
+    const feed = parse(xml);
+    assert.ok(feed.feed.syndication);
+    assert.strictEqual(feed.feed.syndication.updatePeriod, 'weekly');
+    assert.strictEqual(feed.feed.syndication.updateFrequency, undefined);
+    assert.strictEqual(feed.feed.syndication.updateBase, undefined);
+  });
+});
diff --git a/crates/feedparser-rs-node/index.d.ts b/crates/feedparser-rs-node/index.d.ts
index c2c0687..a07033b 100644
--- a/crates/feedparser-rs-node/index.d.ts
+++ b/crates/feedparser-rs-node/index.d.ts
@@ -143,6 +143,14 @@ export interface FeedMeta {
   ttl?: number
   /** License URL (Creative Commons, etc.) */
   license?: string
+  /** Syndication module metadata (RSS 1.0) */
+  syndication?: SyndicationMeta
+  /** Dublin Core creator (author fallback) */
+  dcCreator?: string
+  /** Dublin Core publisher */
+  dcPublisher?: string
+  /** Dublin Core rights (copyright) */
+  dcRights?: string
 }
 
 /** Generator metadata */
@@ -369,6 +377,16 @@ export interface Source {
   id?: string
 }
 
+/** Syndication module metadata (RSS 1.0) */
+export interface SyndicationMeta {
+  /** Update period (hourly, daily, weekly, monthly, yearly) */
+  updatePeriod?: string
+  /** Number of times updated per period */
+  updateFrequency?: number
+  /** Base date for update schedule (ISO 8601) */
+  updateBase?: string
+}
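+
+/*
+ * Illustrative usage sketch (assumes a ParsedFeed obtained from parse()):
+ *
+ *   const feed = parse(xml)
+ *   const syn = feed.feed.syndication
+ *   if (syn?.updatePeriod === 'daily' && syn.updateFrequency) {
+ *     // e.g. updateFrequency 2 means two updates per day
+ *   }
+ */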
 
 /** Tag/category */
 export interface Tag {
   /** Tag term/label */
diff --git a/crates/feedparser-rs-node/src/lib.rs b/crates/feedparser-rs-node/src/lib.rs
index 02060db..b11f692 100644
--- a/crates/feedparser-rs-node/src/lib.rs
+++ b/crates/feedparser-rs-node/src/lib.rs
@@ -9,7 +9,8 @@ use feedparser_rs::{
     FeedMeta as CoreFeedMeta, Generator as CoreGenerator, Image as CoreImage, Link as CoreLink,
     ParsedFeed as CoreParsedFeed, ParserLimits, Person as CorePerson,
     PodcastPerson as CorePodcastPerson, PodcastTranscript as CorePodcastTranscript,
-    Source as CoreSource, Tag as CoreTag, TextConstruct as CoreTextConstruct, TextType,
+    Source as CoreSource, SyndicationMeta as CoreSyndicationMeta, Tag as CoreTag,
+    TextConstruct as CoreTextConstruct, TextType,
 };
 
 /// Default maximum feed size (100 MB) - prevents DoS attacks
@@ -264,6 +265,27 @@ impl From<CoreParsedFeed> for ParsedFeed {
     }
 }
 
+/// Syndication module metadata (RSS 1.0)
+#[napi(object)]
+pub struct SyndicationMeta {
+    /// Update period (hourly, daily, weekly, monthly, yearly)
+    pub update_period: Option<String>,
+    /// Number of times updated per period
+    pub update_frequency: Option<u32>,
+    /// Base date for update schedule (ISO 8601)
+    pub update_base: Option<String>,
+}
+
+impl From<CoreSyndicationMeta> for SyndicationMeta {
+    fn from(core: CoreSyndicationMeta) -> Self {
+        Self {
+            update_period: core.update_period.map(|p| p.as_str().to_string()),
+            update_frequency: core.update_frequency,
+            update_base: core.update_base,
+        }
+    }
+}
+
 /// Feed metadata
 #[napi(object)]
 pub struct FeedMeta {
@@ -319,6 +341,14 @@ pub struct FeedMeta {
     pub ttl: Option<u32>,
     /// License URL (Creative Commons, etc.)
     pub license: Option<String>,
+    /// Syndication module metadata (RSS 1.0)
+    pub syndication: Option<SyndicationMeta>,
+    /// Dublin Core creator (author fallback)
+    pub dc_creator: Option<String>,
+    /// Dublin Core publisher
+    pub dc_publisher: Option<String>,
+    /// Dublin Core rights (copyright)
+    pub dc_rights: Option<String>,
 }
 
 impl From<CoreFeedMeta> for FeedMeta {
@@ -350,6 +380,10 @@ impl From<CoreFeedMeta> for FeedMeta {
             id: core.id,
             ttl: core.ttl,
             license: core.license,
+            syndication: core.syndication.map(SyndicationMeta::from),
+            dc_creator: core.dc_creator,
+            dc_publisher: core.dc_publisher,
+            dc_rights: core.dc_rights,
         }
     }
 }
@@ -411,77 +445,39 @@ pub struct Entry {
 
 impl From<CoreEntry> for Entry {
     fn from(core: CoreEntry) -> Self {
-        // Pre-allocate Vec capacity to avoid reallocations
-        let links_cap = core.links.len();
-        let content_cap = core.content.len();
-        let authors_cap = core.authors.len();
-        let contributors_cap = core.contributors.len();
-        let tags_cap = core.tags.len();
-        let enclosures_cap = core.enclosures.len();
-        let transcripts_cap = core.podcast_transcripts.len();
-        let persons_cap = core.podcast_persons.len();
-
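+        // `collect()` already sizes the target Vec from the iterator's
+        // `size_hint`, so the capacity bookkeeping removed above is redundant.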
         Self {
             id: core.id,
             title: core.title,
             title_detail: core.title_detail.map(TextConstruct::from),
             link: core.link,
-            links: {
-                let mut v = Vec::with_capacity(links_cap);
-                v.extend(core.links.into_iter().map(Link::from));
-                v
-            },
+            links: core.links.into_iter().map(Link::from).collect(),
             summary: core.summary,
             summary_detail: core.summary_detail.map(TextConstruct::from),
-            content: {
-                let mut v = Vec::with_capacity(content_cap);
-                v.extend(core.content.into_iter().map(Content::from));
-                v
-            },
+            content: core.content.into_iter().map(Content::from).collect(),
             published: core.published.map(|dt| dt.timestamp_millis()),
             updated: core.updated.map(|dt| dt.timestamp_millis()),
             created: core.created.map(|dt| dt.timestamp_millis()),
             expired: core.expired.map(|dt| dt.timestamp_millis()),
             author: core.author,
             author_detail: core.author_detail.map(Person::from),
-            authors: {
-                let mut v = Vec::with_capacity(authors_cap);
-                v.extend(core.authors.into_iter().map(Person::from));
-                v
-            },
-            contributors: {
-                let mut v = Vec::with_capacity(contributors_cap);
-                v.extend(core.contributors.into_iter().map(Person::from));
-                v
-            },
+            authors: core.authors.into_iter().map(Person::from).collect(),
+            contributors: core.contributors.into_iter().map(Person::from).collect(),
             publisher: core.publisher,
             publisher_detail: core.publisher_detail.map(Person::from),
-            tags: {
-                let mut v = Vec::with_capacity(tags_cap);
-                v.extend(core.tags.into_iter().map(Tag::from));
-                v
-            },
-            enclosures: {
-                let mut v = Vec::with_capacity(enclosures_cap);
-                v.extend(core.enclosures.into_iter().map(Enclosure::from));
-                v
-            },
+            tags: core.tags.into_iter().map(Tag::from).collect(),
+            enclosures: core.enclosures.into_iter().map(Enclosure::from).collect(),
             comments: core.comments,
             source: core.source.map(Source::from),
-            podcast_transcripts: {
-                let mut v = Vec::with_capacity(transcripts_cap);
-                v.extend(
-                    core.podcast_transcripts
-                        .into_iter()
-                        .map(PodcastTranscript::from),
-                );
-                v
-            },
-            podcast_persons: {
-                let mut v = Vec::with_capacity(persons_cap);
-                v.extend(core.podcast_persons.into_iter().map(PodcastPerson::from));
-                v
-            },
+            podcast_transcripts: core
+                .podcast_transcripts
+                .into_iter()
+                .map(PodcastTranscript::from)
+                .collect(),
+            podcast_persons: core
+                .podcast_persons
+                .into_iter()
+                .map(PodcastPerson::from)
+                .collect(),
             license: core.license,
         }
     }
diff --git a/crates/feedparser-rs-py/src/types/feed_meta.rs b/crates/feedparser-rs-py/src/types/feed_meta.rs
index 5b8c3da..9c241d8 100644
--- a/crates/feedparser-rs-py/src/types/feed_meta.rs
+++ b/crates/feedparser-rs-py/src/types/feed_meta.rs
@@ -4,6 +4,7 @@ use pyo3::prelude::*;
 use super::common::{PyGenerator, PyImage, PyLink, PyPerson, PyTag, PyTextConstruct};
 use super::datetime::optional_datetime_to_struct_time;
 use super::podcast::{PyItunesFeedMeta, PyPodcastMeta};
+use super::syndication::PySyndicationMeta;
 
 #[pyclass(name = "FeedMeta", module = "feedparser_rs")]
 #[derive(Clone)]
@@ -212,6 +213,29 @@ impl PyFeedMeta {
         self.inner.license.as_deref()
     }
 
+    #[getter]
+    fn syndication(&self) -> Option<PySyndicationMeta> {
+        self.inner
+            .syndication
+            .as_ref()
+            .map(|s| PySyndicationMeta::from_core(s.clone()))
+    }
+
+    #[getter]
+    fn dc_creator(&self) -> Option<&str> {
+        self.inner.dc_creator.as_deref()
+    }
+
+    #[getter]
+    fn dc_publisher(&self) -> Option<&str> {
+        self.inner.dc_publisher.as_deref()
+    }
+
+    #[getter]
+    fn dc_rights(&self) -> Option<&str> {
+        self.inner.dc_rights.as_deref()
+    }
+
     fn __repr__(&self) -> String {
         format!(
             "FeedMeta(title='{}', link='{}')",
diff --git a/crates/feedparser-rs-py/src/types/mod.rs b/crates/feedparser-rs-py/src/types/mod.rs
index df99f58..cafb245 100644
--- a/crates/feedparser-rs-py/src/types/mod.rs
+++ b/crates/feedparser-rs-py/src/types/mod.rs
@@ -4,5 +4,6 @@ pub mod entry;
 pub mod feed_meta;
 pub mod parsed_feed;
 pub mod podcast;
+pub mod syndication;
 
 pub use parsed_feed::PyParsedFeed;
diff --git a/crates/feedparser-rs-py/src/types/syndication.rs b/crates/feedparser-rs-py/src/types/syndication.rs
new file mode 100644
index 0000000..684741c
--- /dev/null
+++ b/crates/feedparser-rs-py/src/types/syndication.rs
@@ -0,0 +1,45 @@
+use feedparser_rs::SyndicationMeta as CoreSyndicationMeta;
+use pyo3::prelude::*;
+
+/// Syndication module metadata
+#[pyclass(name = "SyndicationMeta", module = "feedparser_rs")]
+#[derive(Clone)]
+pub struct PySyndicationMeta {
+    inner: CoreSyndicationMeta,
+}
+
+impl PySyndicationMeta {
+    pub fn from_core(core: CoreSyndicationMeta) -> Self {
+        Self { inner: core }
+    }
+}
+
+#[pymethods]
+impl PySyndicationMeta {
+    /// Update period (hourly, daily, weekly, monthly, yearly)
+    #[getter]
+    fn update_period(&self) -> Option<&str> {
+        self.inner.update_period.as_ref().map(|p| p.as_str())
+    }
+
+    /// Number of times updated per period
+    #[getter]
+    fn update_frequency(&self) -> Option<u32> {
+        self.inner.update_frequency
+    }
+
+    /// Base date for update schedule (ISO 8601)
+    #[getter]
+    fn update_base(&self) -> Option<&str> {
+        self.inner.update_base.as_deref()
+    }
+
+    fn __repr__(&self) -> String {
+        format!(
+            "SyndicationMeta(update_period={:?}, update_frequency={:?}, update_base={:?})",
+            self.inner.update_period.as_ref().map(|p| p.as_str()),
+            self.inner.update_frequency,
+            self.inner.update_base.as_deref()
+        )
+    }
+}
diff --git a/crates/feedparser-rs-py/tests/test_syndication.py b/crates/feedparser-rs-py/tests/test_syndication.py
new file mode 100644
index 0000000..fc7ba9d
--- /dev/null
+++ b/crates/feedparser-rs-py/tests/test_syndication.py
@@ -0,0 +1,166 @@
+import feedparser_rs
+
+
+def test_syndication_update_period():
+    """Test syn:updatePeriod parsing"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <syn:updatePeriod>daily</syn:updatePeriod>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.syndication is not None
+    assert d.feed.syndication.update_period == "daily"
+
+
+def test_syndication_update_frequency():
+    """Test syn:updateFrequency parsing"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <syn:updateFrequency>2</syn:updateFrequency>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.syndication is not None
+    assert d.feed.syndication.update_frequency == 2
+
+
+def test_syndication_update_base():
+    """Test syn:updateBase parsing"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <syn:updateBase>2024-12-18T00:00:00Z</syn:updateBase>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.syndication is not None
+    assert d.feed.syndication.update_base == "2024-12-18T00:00:00Z"
+
+
+def test_syndication_complete():
+    """Test all syndication fields together"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <syn:updatePeriod>hourly</syn:updatePeriod>
+        <syn:updateFrequency>1</syn:updateFrequency>
+        <syn:updateBase>2024-01-01T00:00:00Z</syn:updateBase>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    syn = d.feed.syndication
+    assert syn is not None
+    assert syn.update_period == "hourly"
+    assert syn.update_frequency == 1
+    assert syn.update_base == "2024-01-01T00:00:00Z"
+
+
+def test_syndication_missing():
+    """Test feed without syndication data"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/">
+      <channel rdf:about="https://example.com/">
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.syndication is None
+
+
+def test_dublin_core_fields():
+    """Test Dublin Core fields"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:dc="http://purl.org/dc/elements/1.1/">
+      <channel rdf:about="https://example.com/">
+        <title>Test Feed</title>
+        <link>https://example.com</link>
+        <dc:creator>John Doe</dc:creator>
+        <dc:publisher>ACME Corp</dc:publisher>
+        <dc:rights>Copyright 2024</dc:rights>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.dc_creator == "John Doe"
+    assert d.feed.dc_publisher == "ACME Corp"
+    assert d.feed.dc_rights == "Copyright 2024"
+
+
+def test_invalid_update_period():
+    """Test invalid updatePeriod is handled gracefully (bozo pattern)"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test</title>
+        <link>https://example.com</link>
+        <syn:updatePeriod>invalid</syn:updatePeriod>
+      </channel>
+    </rdf:RDF>"""
+    d = feedparser_rs.parse(feed_xml)
+    # Should not crash, syndication should be None or update_period None
+    assert d.feed.syndication is None or d.feed.syndication.update_period is None
+
+
+def test_case_insensitive_update_period():
+    """Test updatePeriod is case-insensitive"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test</title>
+        <link>https://example.com</link>
+        <syn:updatePeriod>HOURLY</syn:updatePeriod>
+      </channel>
+    </rdf:RDF>"""
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.syndication is not None
+    assert d.feed.syndication.update_period == "hourly"
+
+
+def test_partial_syndication():
+    """Test feed with only some syndication fields"""
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test</title>
+        <link>https://example.com</link>
+        <syn:updatePeriod>weekly</syn:updatePeriod>
+      </channel>
+    </rdf:RDF>"""
+    d = feedparser_rs.parse(feed_xml)
+    assert d.feed.syndication is not None
+    assert d.feed.syndication.update_period == "weekly"
+    assert d.feed.syndication.update_frequency is None
+    assert d.feed.syndication.update_base is None
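+
+
+def test_syndication_usage_example():
+    """Illustrative sketch: derive an approximate update interval in hours.
+
+    The hours-per-period table below is a rough convention for demonstration,
+    not something defined by the syndication module itself.
+    """
+    feed_xml = b"""<?xml version="1.0"?>
+    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+             xmlns="http://purl.org/rss/1.0/"
+             xmlns:syn="http://purl.org/rss/1.0/modules/syndication/">
+      <channel rdf:about="https://example.com/">
+        <title>Test</title>
+        <link>https://example.com</link>
+        <syn:updatePeriod>daily</syn:updatePeriod>
+        <syn:updateFrequency>4</syn:updateFrequency>
+      </channel>
+    </rdf:RDF>"""
+
+    d = feedparser_rs.parse(feed_xml)
+    syn = d.feed.syndication
+    hours_per_period = {"hourly": 1, "daily": 24, "weekly": 168, "monthly": 720, "yearly": 8760}
+    interval = hours_per_period[syn.update_period] / (syn.update_frequency or 1)
+    assert interval == 6  # four updates per day -> roughly every six hours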