diff --git a/crates/feedparser-rs-core/src/parser/atom.rs b/crates/feedparser-rs-core/src/parser/atom.rs index 4e48f74..1f2e4cd 100644 --- a/crates/feedparser-rs-core/src/parser/atom.rs +++ b/crates/feedparser-rs-core/src/parser/atom.rs @@ -136,6 +136,10 @@ fn parse_feed_element( { feed.feed.link = Some(link.href.clone()); } + if feed.feed.license.is_none() && link.rel.as_deref() == Some("license") + { + feed.feed.license = Some(link.href.clone()); + } feed.feed .links .try_push_limited(link, limits.max_links_per_feed); @@ -305,6 +309,9 @@ fn parse_entry( if entry.link.is_none() && link.rel.as_deref() == Some("alternate") { entry.link = Some(link.href.clone()); } + if entry.license.is_none() && link.rel.as_deref() == Some("license") { + entry.license = Some(link.href.clone()); + } entry .links .try_push_limited(link, limits.max_links_per_entry); @@ -926,4 +933,45 @@ mod tests { assert_eq!(feed.feed.links.len(), 1); assert_eq!(feed.feed.tags.len(), 1); } + + #[test] + fn test_parse_atom_license_feed() { + let xml = br#" + <feed xmlns="http://www.w3.org/2005/Atom"> + <title>Test Feed</title> + <link rel="license" href="https://creativecommons.org/licenses/by/4.0/"/> + <link rel="alternate" href="https://example.com/"/> + </feed>"#; + + let feed = parse_atom10(xml).unwrap(); + assert_eq!( + feed.feed.license.as_deref(), + Some("https://creativecommons.org/licenses/by/4.0/") + ); + assert_eq!(feed.feed.link.as_deref(), Some("https://example.com/")); + } + + #[test] + fn test_parse_atom_license_entry() { + let xml = br#" + <feed xmlns="http://www.w3.org/2005/Atom"> + <entry> + <title>Licensed Entry</title> + <id>urn:uuid:1</id> + <link rel="license" href="https://creativecommons.org/licenses/by-sa/3.0/"/> + <link rel="alternate" href="https://example.com/entry/1"/> + </entry> + </feed>"#; + + let feed = parse_atom10(xml).unwrap(); + assert_eq!(feed.entries.len(), 1); + assert_eq!( + feed.entries[0].license.as_deref(), + Some("https://creativecommons.org/licenses/by-sa/3.0/") + ); + assert_eq!( + feed.entries[0].link.as_deref(), + Some("https://example.com/entry/1") + ); + } } diff --git a/crates/feedparser-rs-core/src/parser/common.rs b/crates/feedparser-rs-core/src/parser/common.rs index efa5540..569c24b 100644 --- a/crates/feedparser-rs-core/src/parser/common.rs +++ b/crates/feedparser-rs-core/src/parser/common.rs @@ -285,6 +285,47 @@ pub fn extract_xml_base( .map(|s| 
s.to_string()) } +/// Extract xml:lang attribute from element +/// +/// Returns the language code if xml:lang or lang attribute exists. +/// Respects `max_attribute_length` limit for `DoS` protection. +/// +/// # Arguments +/// +/// * `element` - The XML element to extract xml:lang from +/// * `max_attr_length` - Maximum allowed attribute length (`DoS` protection) +/// +/// # Returns +/// +/// * `Some(String)` - The xml:lang value if found and within length limit +/// * `None` - If attribute not found or exceeds length limit +/// +/// # Examples +/// +/// ```ignore +/// use feedparser_rs::parser::common::extract_xml_lang; +/// +/// let element = /* BytesStart from quick-xml */; +/// if let Some(lang) = extract_xml_lang(&element, 1024) { +/// println!("Language: {}", lang); +/// } +/// ``` +pub fn extract_xml_lang( + element: &quick_xml::events::BytesStart, + max_attr_length: usize, +) -> Option<String> { + element + .attributes() + .flatten() + .find(|attr| { + let key = attr.key.as_ref(); + key == b"xml:lang" || key == b"lang" + }) + .filter(|attr| attr.value.len() <= max_attr_length) + .and_then(|attr| attr.unescape_value().ok()) + .map(|s| s.to_string()) +} + /// Read text content from current XML element (handles text and CDATA) pub fn read_text( reader: &mut Reader<&[u8]>, diff --git a/crates/feedparser-rs-core/src/parser/rss.rs b/crates/feedparser-rs-core/src/parser/rss.rs index 6223c33..0d40799 100644 --- a/crates/feedparser-rs-core/src/parser/rss.rs +++ b/crates/feedparser-rs-core/src/parser/rss.rs @@ -15,8 +15,8 @@ use crate::{ use quick_xml::{Reader, events::Event}; use super::common::{ - EVENT_BUFFER_CAPACITY, LimitedCollectionExt, check_depth, init_feed, is_content_tag, is_dc_tag, - is_itunes_tag, is_media_tag, read_text, skip_element, + EVENT_BUFFER_CAPACITY, LimitedCollectionExt, check_depth, extract_xml_lang, init_feed, + is_content_tag, is_dc_tag, is_itunes_tag, is_media_tag, read_text, skip_element, }; /// Error message for malformed XML attributes (shared 
constant) @@ -110,10 +110,16 @@ pub fn parse_rss20_with_limits(data: &[u8], limits: ParserLimits) -> Result { + let channel_lang = extract_xml_lang(&e, limits.max_attribute_length); depth += 1; - if let Err(e) = - parse_channel(&mut reader, &mut feed, &limits, &mut depth, &mut base_ctx) - { + if let Err(e) = parse_channel( + &mut reader, + &mut feed, + &limits, + &mut depth, + &mut base_ctx, + channel_lang.as_deref(), + ) { feed.bozo = true; feed.bozo_exception = Some(e.to_string()); } @@ -140,6 +146,7 @@ fn parse_channel( limits: &ParserLimits, depth: &mut usize, base_ctx: &mut BaseUrlContext, + channel_lang: Option<&str>, ) -> Result<()> { let mut buf = Vec::with_capacity(EVENT_BUFFER_CAPACITY); @@ -163,7 +170,15 @@ fn parse_channel( match tag.as_slice() { b"title" | b"link" | b"description" | b"language" | b"pubDate" | b"managingEditor" | b"webMaster" | b"generator" | b"ttl" | b"category" => { - parse_channel_standard(reader, &mut buf, &tag, feed, limits, base_ctx)?; + parse_channel_standard( + reader, + &mut buf, + &tag, + feed, + limits, + base_ctx, + channel_lang, + )?; } b"image" => { if let Ok(image) = parse_image(reader, &mut buf, limits, depth) { @@ -171,11 +186,16 @@ fn parse_channel( } } b"item" => { + let item_lang = extract_xml_lang(&e, limits.max_attribute_length); + if !feed.check_entry_limit(reader, &mut buf, limits, depth)? 
{ continue; } - match parse_item(reader, &mut buf, limits, depth, base_ctx) { + let effective_lang = item_lang.as_deref().or(channel_lang); + + match parse_item(reader, &mut buf, limits, depth, base_ctx, effective_lang) + { Ok((entry, has_attr_errors)) => { if has_attr_errors { feed.bozo = true; @@ -261,10 +281,17 @@ fn parse_channel_standard( feed: &mut ParsedFeed, limits: &ParserLimits, base_ctx: &mut BaseUrlContext, + channel_lang: Option<&str>, ) -> Result<()> { match tag { b"title" => { - feed.feed.title = Some(read_text(reader, buf, limits)?); + let text = read_text(reader, buf, limits)?; + feed.feed.set_title(TextConstruct { + value: text, + content_type: TextType::Text, + language: channel_lang.map(String::from), + base: base_ctx.base().map(String::from), + }); } b"link" => { let link_text = read_text(reader, buf, limits)?; @@ -276,7 +303,13 @@ fn parse_channel_standard( } } b"description" => { - feed.feed.subtitle = Some(read_text(reader, buf, limits)?); + let text = read_text(reader, buf, limits)?; + feed.feed.set_subtitle(TextConstruct { + value: text, + content_type: TextType::Html, + language: channel_lang.map(String::from), + base: base_ctx.base().map(String::from), + }); } b"language" => { feed.feed.language = Some(read_text(reader, buf, limits)?); @@ -500,6 +533,9 @@ fn parse_channel_namespace( } else if let Some(_media_element) = is_media_tag(tag) { skip_element(reader, buf, limits, depth)?; Ok(true) + } else if tag.starts_with(b"creativeCommons:license") || tag == b"license" { + feed.feed.license = Some(read_text(reader, buf, limits)?); + Ok(true) } else { Ok(false) } @@ -516,6 +552,7 @@ fn parse_item( limits: &ParserLimits, depth: &mut usize, base_ctx: &BaseUrlContext, + item_lang: Option<&str>, ) -> Result<(Entry, bool)> { let mut entry = Entry::with_capacity(); let mut has_attr_errors = false; @@ -544,7 +581,9 @@ fn parse_item( match tag.as_slice() { b"title" | b"link" | b"description" | b"guid" | b"pubDate" | b"author" | b"category" | 
b"comments" => { - parse_item_standard(reader, buf, &tag, &mut entry, limits, base_ctx)?; + parse_item_standard( + reader, buf, &tag, &mut entry, limits, base_ctx, item_lang, + )?; } b"enclosure" => { if let Some(mut enclosure) = parse_enclosure(&attrs, limits) { @@ -603,10 +642,17 @@ fn parse_item_standard( entry: &mut Entry, limits: &ParserLimits, base_ctx: &BaseUrlContext, + item_lang: Option<&str>, ) -> Result<()> { match tag { b"title" => { - entry.title = Some(read_text(reader, buf, limits)?); + let text = read_text(reader, buf, limits)?; + entry.set_title(TextConstruct { + value: text, + content_type: TextType::Text, + language: item_lang.map(String::from), + base: base_ctx.base().map(String::from), + }); } b"link" => { let link_text = read_text(reader, buf, limits)?; @@ -622,13 +668,12 @@ fn parse_item_standard( ); } b"description" => { - let desc = read_text(reader, buf, limits)?; - entry.summary = Some(desc.clone()); - entry.summary_detail = Some(TextConstruct { - value: desc, + let text = read_text(reader, buf, limits)?; + entry.set_summary(TextConstruct { + value: text, content_type: TextType::Html, - language: None, - base: None, + language: item_lang.map(String::from), + base: base_ctx.base().map(String::from), }); } b"guid" => { @@ -857,6 +902,9 @@ fn parse_item_namespace( depth, )?; Ok(true) + } else if tag.starts_with(b"creativeCommons:license") || tag == b"license" { + entry.license = Some(read_text(reader, buf, limits)?); + Ok(true) } else { Ok(false) } @@ -1791,4 +1839,142 @@ mod tests { ); assert!(entry.content[0].value.contains("