Skip to content

Commit b8dc6ad

Browse files
authored
feat(parser): add xml:lang tracking and license field support (#27)
* feat: complete *_detail structures with xml:lang and license support

  Add comprehensive xml:lang tracking to the RSS parser and license field support across all feed formats to complete Phase 2 of the *_detail infrastructure.

  Changes:
  - Add extract_xml_lang() helper function to parser/common.rs
  - Track xml:lang at RSS channel and item levels with inheritance
  - Populate TextConstruct.language for title and description fields
  - Add license field to FeedMeta and Entry types
  - Parse creativeCommons:license and plain license tags in RSS
  - Extract license from link rel="license" in Atom feeds
  - Update Python and Node.js bindings with license getters
  - Add comprehensive integration tests for xml:lang and license

  The RSS parser now properly tracks xml:lang attributes on channel and item elements, propagating language information to TextConstruct fields. Item-level xml:lang overrides channel-level language, matching standard XML behavior.

  License URLs are extracted from both RSS (creativeCommons:license, license tags) and Atom (link rel="license") feeds, stored in the license field for both feed and entry metadata.

  Tests: 476 passed, clippy clean, all CI checks passing

* fix(node): add license field to TypeScript definitions

  Add missing license field to Entry and FeedMeta interfaces in TypeScript definitions. This field was already implemented in the Rust code but was not exposed in the type definitions.

  Also add test for empty xml:lang attribute handling to verify edge case behavior.

* style: apply nightly rustfmt formatting
1 parent 066ab85 commit b8dc6ad

File tree

9 files changed

+316
-17
lines changed

9 files changed

+316
-17
lines changed

crates/feedparser-rs-core/src/parser/atom.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,10 @@ fn parse_feed_element(
136136
{
137137
feed.feed.link = Some(link.href.clone());
138138
}
139+
if feed.feed.license.is_none() && link.rel.as_deref() == Some("license")
140+
{
141+
feed.feed.license = Some(link.href.clone());
142+
}
139143
feed.feed
140144
.links
141145
.try_push_limited(link, limits.max_links_per_feed);
@@ -305,6 +309,9 @@ fn parse_entry(
305309
if entry.link.is_none() && link.rel.as_deref() == Some("alternate") {
306310
entry.link = Some(link.href.clone());
307311
}
312+
if entry.license.is_none() && link.rel.as_deref() == Some("license") {
313+
entry.license = Some(link.href.clone());
314+
}
308315
entry
309316
.links
310317
.try_push_limited(link, limits.max_links_per_entry);
@@ -926,4 +933,45 @@ mod tests {
926933
assert_eq!(feed.feed.links.len(), 1);
927934
assert_eq!(feed.feed.tags.len(), 1);
928935
}
936+
937+
#[test]
fn test_parse_atom_license_feed() {
    // A feed-level <link rel="license"> should populate `feed.license`
    // without disturbing the regular alternate link stored in `feed.link`.
    let input = br#"<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
            <title>Test Feed</title>
            <link rel="license" href="https://creativecommons.org/licenses/by/4.0/"/>
            <link rel="alternate" href="https://example.com/"/>
        </feed>"#;
    let expected_license = "https://creativecommons.org/licenses/by/4.0/";
    let expected_link = "https://example.com/";

    let parsed = parse_atom10(input).unwrap();
    let meta = &parsed.feed;

    assert_eq!(meta.license.as_deref(), Some(expected_license));
    assert_eq!(meta.link.as_deref(), Some(expected_link));
}
953+
954+
#[test]
fn test_parse_atom_license_entry() {
    // An entry-level <link rel="license"> should populate `entry.license`
    // while the alternate link still ends up in `entry.link`.
    let input = br#"<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
            <entry>
                <title>Licensed Entry</title>
                <id>urn:uuid:1</id>
                <link rel="license" href="https://creativecommons.org/licenses/by-sa/3.0/"/>
                <link rel="alternate" href="https://example.com/entry/1"/>
            </entry>
        </feed>"#;
    let expected_license = "https://creativecommons.org/licenses/by-sa/3.0/";
    let expected_link = "https://example.com/entry/1";

    let parsed = parse_atom10(input).unwrap();
    assert_eq!(parsed.entries.len(), 1);

    let first = &parsed.entries[0];
    assert_eq!(first.license.as_deref(), Some(expected_license));
    assert_eq!(first.link.as_deref(), Some(expected_link));
}
929977
}

crates/feedparser-rs-core/src/parser/common.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,47 @@ pub fn extract_xml_base(
285285
.map(|s| s.to_string())
286286
}
287287

288+
/// Extract xml:lang attribute from element
289+
///
290+
/// Returns the language code if xml:lang or lang attribute exists.
291+
/// Respects `max_attribute_length` limit for `DoS` protection.
292+
///
293+
/// # Arguments
294+
///
295+
/// * `element` - The XML element to extract xml:lang from
296+
/// * `max_attr_length` - Maximum allowed attribute length (`DoS` protection)
297+
///
298+
/// # Returns
299+
///
300+
/// * `Some(String)` - The xml:lang value if found and within length limit
301+
/// * `None` - If attribute not found or exceeds length limit
302+
///
303+
/// # Examples
304+
///
305+
/// ```ignore
306+
/// use feedparser_rs::parser::common::extract_xml_lang;
307+
///
308+
/// let element = /* BytesStart from quick-xml */;
309+
/// if let Some(lang) = extract_xml_lang(&element, 1024) {
310+
/// println!("Language: {}", lang);
311+
/// }
312+
/// ```
313+
pub fn extract_xml_lang(
314+
element: &quick_xml::events::BytesStart,
315+
max_attr_length: usize,
316+
) -> Option<String> {
317+
element
318+
.attributes()
319+
.flatten()
320+
.find(|attr| {
321+
let key = attr.key.as_ref();
322+
key == b"xml:lang" || key == b"lang"
323+
})
324+
.filter(|attr| attr.value.len() <= max_attr_length)
325+
.and_then(|attr| attr.unescape_value().ok())
326+
.map(|s| s.to_string())
327+
}
328+
288329
/// Read text content from current XML element (handles text and CDATA)
289330
pub fn read_text(
290331
reader: &mut Reader<&[u8]>,

0 commit comments

Comments
 (0)