43 changes: 35 additions & 8 deletions crates/feedparser-rs-core/src/parser/atom.rs
@@ -8,13 +8,14 @@ use crate::{
Content, Entry, FeedVersion, Generator, Link, MediaContent, MediaThumbnail, ParsedFeed,
Person, Source, Tag, TextConstruct, TextType,
},
util::parse_date,
util::{base_url::BaseUrlContext, parse_date},
};
use quick_xml::{Reader, events::Event};

use super::common::{
EVENT_BUFFER_CAPACITY, FromAttributes, LimitedCollectionExt, bytes_to_string, check_depth,
init_feed, is_content_tag, is_dc_tag, is_media_tag, read_text, skip_element, skip_to_end,
extract_xml_base, init_feed, is_content_tag, is_dc_tag, is_media_tag, read_text, skip_element,
skip_to_end,
};

/// Parse Atom 1.0 feed from raw bytes
@@ -63,12 +64,19 @@ pub fn parse_atom10_with_limits(data: &[u8], limits: ParserLimits) -> Result<Par
let mut feed = init_feed(FeedVersion::Atom10, limits.max_entries);
let mut buf = Vec::with_capacity(EVENT_BUFFER_CAPACITY);
let mut depth: usize = 1;
let mut base_ctx = BaseUrlContext::new();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) if e.local_name().as_ref() == b"feed" => {
if let Some(xml_base) = extract_xml_base(&e, limits.max_attribute_length) {
base_ctx.update_base(&xml_base);
}

depth += 1;
if let Err(e) = parse_feed_element(&mut reader, &mut feed, &limits, &mut depth) {
if let Err(e) =
parse_feed_element(&mut reader, &mut feed, &limits, &mut depth, &base_ctx)
{
feed.bozo = true;
feed.bozo_exception = Some(e.to_string());
}
@@ -95,6 +103,7 @@ fn parse_feed_element(
feed: &mut ParsedFeed,
limits: &ParserLimits,
depth: &mut usize,
base_ctx: &BaseUrlContext,
) -> Result<()> {
let mut buf = Vec::with_capacity(EVENT_BUFFER_CAPACITY);

@@ -117,10 +126,12 @@
feed.feed.set_title(text);
}
b"link" => {
if let Some(link) = Link::from_attributes(
if let Some(mut link) = Link::from_attributes(
element.attributes().flatten(),
limits.max_attribute_length,
) {
link.href = base_ctx.resolve_safe(&link.href);

if feed.feed.link.is_none() && link.rel.as_deref() == Some("alternate")
{
feed.feed.link = Some(link.href.clone());
@@ -144,6 +155,10 @@
let text = read_text(reader, &mut buf, limits)?;
feed.feed.updated = parse_date(&text);
}
b"published" if !is_empty => {
let text = read_text(reader, &mut buf, limits)?;
feed.feed.published = parse_date(&text);
}
b"author" if !is_empty => {
if let Ok(person) = parse_person(reader, &mut buf, limits, depth) {
if feed.feed.author.is_none() {
@@ -177,10 +192,12 @@
feed.feed.set_generator(generator);
}
b"icon" if !is_empty => {
feed.feed.icon = Some(read_text(reader, &mut buf, limits)?);
let url = read_text(reader, &mut buf, limits)?;
feed.feed.icon = Some(base_ctx.resolve_safe(&url));
}
b"logo" if !is_empty => {
feed.feed.logo = Some(read_text(reader, &mut buf, limits)?);
let url = read_text(reader, &mut buf, limits)?;
feed.feed.logo = Some(base_ctx.resolve_safe(&url));
}
b"rights" if !is_empty => {
let text = parse_text_construct(reader, &mut buf, &element, limits)?;
@@ -191,7 +208,14 @@
continue;
}

match parse_entry(reader, &mut buf, limits, depth) {
let mut entry_ctx = base_ctx.child();
if let Some(xml_base) =
extract_xml_base(&element, limits.max_attribute_length)
{
entry_ctx.update_base(&xml_base);
}

match parse_entry(reader, &mut buf, limits, depth, &entry_ctx) {
Ok(entry) => feed.entries.push(entry),
Err(e) => {
feed.bozo = true;
@@ -249,6 +273,7 @@ fn parse_entry(
buf: &mut Vec<u8>,
limits: &ParserLimits,
depth: &mut usize,
base_ctx: &BaseUrlContext,
) -> Result<Entry> {
let mut entry = Entry::with_capacity();

@@ -271,10 +296,12 @@
entry.set_title(text);
}
b"link" => {
if let Some(link) = Link::from_attributes(
if let Some(mut link) = Link::from_attributes(
element.attributes().flatten(),
limits.max_attribute_length,
) {
link.href = base_ctx.resolve_safe(&link.href);

if entry.link.is_none() && link.rel.as_deref() == Some("alternate") {
entry.link = Some(link.href.clone());
}
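Taken together, the atom.rs changes resolve relative `link@href`, `icon`, and `logo` values against the nearest `xml:base` in scope, and each `<entry>` gets a child context that can override the feed-level base. A minimal sketch of that scoping rule, using the `url` crate directly rather than the crate's internal `BaseUrlContext` (which is not part of this diff):

```rust
use url::Url;

fn main() -> Result<(), url::ParseError> {
    // Feed-level base, e.g. <feed xml:base="https://example.com/blog/">.
    let feed_base = Url::parse("https://example.com/blog/")?;

    // Entry-level override, e.g. <entry xml:base="2024/">; a relative
    // xml:base is resolved against the parent scope, as entry_ctx does.
    let entry_base = feed_base.join("2024/")?;

    // A relative <link href="post.html"/> inside that entry.
    assert_eq!(
        entry_base.join("post.html")?.as_str(),
        "https://example.com/blog/2024/post.html"
    );
    Ok(())
}
```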
41 changes: 41 additions & 0 deletions crates/feedparser-rs-core/src/parser/common.rs
@@ -244,6 +244,47 @@ pub fn is_itunes_tag(name: &[u8], tag: &[u8]) -> bool {
name == tag
}

/// Extract xml:base attribute from element
///
/// Returns the base URL string if xml:base attribute exists.
/// Respects `max_attribute_length` limit for `DoS` protection.
///
/// # Arguments
///
/// * `element` - The XML element to extract xml:base from
/// * `max_attr_length` - Maximum allowed attribute length (`DoS` protection)
///
/// # Returns
///
/// * `Some(String)` - The xml:base value if found and within length limit
/// * `None` - If attribute not found or exceeds length limit
///
/// # Examples
///
/// ```ignore
/// use feedparser_rs::parser::common::extract_xml_base;
/// use quick_xml::events::BytesStart;
///
/// let mut element = BytesStart::new("feed");
/// element.push_attribute(("xml:base", "https://example.com/feed/"));
/// if let Some(base) = extract_xml_base(&element, 1024) {
///     println!("Base URL: {}", base);
/// }
/// ```
pub fn extract_xml_base(
element: &quick_xml::events::BytesStart,
max_attr_length: usize,
) -> Option<String> {
element
.attributes()
.flatten()
.find(|attr| {
let key = attr.key.as_ref();
key == b"xml:base" || key == b"base"
})
.filter(|attr| attr.value.len() <= max_attr_length)
.and_then(|attr| attr.unescape_value().ok())
.map(|s| s.to_string())
}

/// Read text content from current XML element (handles text and CDATA)
pub fn read_text(
reader: &mut Reader<&[u8]>,
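The new call sites depend on `util::base_url::BaseUrlContext`, which this PR's diff does not show. A minimal sketch of the interface those call sites assume — `new`, `child`, `update_base`, `base`, and `resolve_safe` — built on the `url` crate; the real implementation may well differ, for example in how `resolve_safe` filters schemes:

```rust
use url::Url;

/// Hypothetical reconstruction of util::base_url::BaseUrlContext,
/// covering only the methods used in this diff.
#[derive(Clone, Default)]
pub struct BaseUrlContext {
    base: Option<Url>,
}

impl BaseUrlContext {
    pub fn new() -> Self {
        Self::default()
    }

    /// Child scope for an <entry>, inheriting the current base.
    pub fn child(&self) -> Self {
        self.clone()
    }

    /// Set the base; a relative xml:base is joined onto the inherited one.
    pub fn update_base(&mut self, raw: &str) {
        self.base = match &self.base {
            Some(parent) => parent.join(raw).ok(),
            None => Url::parse(raw).ok(),
        };
    }

    pub fn base(&self) -> Option<&Url> {
        self.base.as_ref()
    }

    /// Resolve href against the base, returning the input unchanged on
    /// any parse failure (hence "safe": it never errors).
    pub fn resolve_safe(&self, href: &str) -> String {
        match &self.base {
            Some(base) => base
                .join(href)
                .map(|u| u.to_string())
                .unwrap_or_else(|_| href.to_string()),
            None => href.to_string(),
        }
    }
}
```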
37 changes: 25 additions & 12 deletions crates/feedparser-rs-core/src/parser/rss.rs
@@ -10,7 +10,7 @@ use crate::{
PodcastPerson, PodcastTranscript, Source, Tag, TextConstruct, TextType, parse_duration,
parse_explicit,
},
util::{parse_date, text::truncate_to_length},
util::{base_url::BaseUrlContext, parse_date, text::truncate_to_length},
};
use quick_xml::{Reader, events::Event};

@@ -105,12 +105,15 @@ pub fn parse_rss20_with_limits(data: &[u8], limits: ParserLimits) -> Result<Pars
let mut feed = init_feed(FeedVersion::Rss20, limits.max_entries);
let mut buf = Vec::with_capacity(EVENT_BUFFER_CAPACITY);
let mut depth: usize = 1;
let mut base_ctx = BaseUrlContext::new();

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(e)) if e.local_name().as_ref() == b"channel" => {
depth += 1;
if let Err(e) = parse_channel(&mut reader, &mut feed, &limits, &mut depth) {
if let Err(e) =
parse_channel(&mut reader, &mut feed, &limits, &mut depth, &mut base_ctx)
{
feed.bozo = true;
feed.bozo_exception = Some(e.to_string());
}
@@ -136,6 +139,7 @@ fn parse_channel(
feed: &mut ParsedFeed,
limits: &ParserLimits,
depth: &mut usize,
base_ctx: &mut BaseUrlContext,
) -> Result<()> {
let mut buf = Vec::with_capacity(EVENT_BUFFER_CAPACITY);

@@ -159,7 +163,7 @@
match tag.as_slice() {
b"title" | b"link" | b"description" | b"language" | b"pubDate"
| b"managingEditor" | b"webMaster" | b"generator" | b"ttl" | b"category" => {
parse_channel_standard(reader, &mut buf, &tag, feed, limits)?;
parse_channel_standard(reader, &mut buf, &tag, feed, limits, base_ctx)?;
}
b"image" => {
if let Ok(image) = parse_image(reader, &mut buf, limits, depth) {
@@ -171,7 +175,7 @@
continue;
}

match parse_item(reader, &mut buf, limits, depth) {
match parse_item(reader, &mut buf, limits, depth, base_ctx) {
Ok((entry, has_attr_errors)) => {
if has_attr_errors {
feed.bozo = true;
@@ -256,6 +260,7 @@ fn parse_channel_standard(
tag: &[u8],
feed: &mut ParsedFeed,
limits: &ParserLimits,
base_ctx: &mut BaseUrlContext,
) -> Result<()> {
match tag {
b"title" => {
@@ -264,7 +269,11 @@
b"link" => {
let link_text = read_text(reader, buf, limits)?;
feed.feed
.set_alternate_link(link_text, limits.max_links_per_feed);
.set_alternate_link(link_text.clone(), limits.max_links_per_feed);

if base_ctx.base().is_none() {
base_ctx.update_base(&link_text);
}
}
b"description" => {
feed.feed.subtitle = Some(read_text(reader, buf, limits)?);
@@ -275,7 +284,7 @@
b"pubDate" => {
let text = read_text(reader, buf, limits)?;
match parse_date(&text) {
Some(dt) => feed.feed.updated = Some(dt),
Some(dt) => feed.feed.published = Some(dt),
None if !text.is_empty() => {
feed.bozo = true;
feed.bozo_exception = Some("Invalid pubDate format".to_string());
@@ -506,6 +515,7 @@ fn parse_item(
buf: &mut Vec<u8>,
limits: &ParserLimits,
depth: &mut usize,
base_ctx: &BaseUrlContext,
) -> Result<(Entry, bool)> {
let mut entry = Entry::with_capacity();
let mut has_attr_errors = false;
@@ -534,10 +544,11 @@
match tag.as_slice() {
b"title" | b"link" | b"description" | b"guid" | b"pubDate" | b"author"
| b"category" | b"comments" => {
parse_item_standard(reader, buf, &tag, &mut entry, limits)?;
parse_item_standard(reader, buf, &tag, &mut entry, limits, base_ctx)?;
}
b"enclosure" => {
if let Some(enclosure) = parse_enclosure(&attrs, limits) {
if let Some(mut enclosure) = parse_enclosure(&attrs, limits) {
enclosure.url = base_ctx.resolve_safe(&enclosure.url);
entry
.enclosures
.try_push_limited(enclosure, limits.max_enclosures);
@@ -591,17 +602,19 @@ fn parse_item_standard(
tag: &[u8],
entry: &mut Entry,
limits: &ParserLimits,
base_ctx: &BaseUrlContext,
) -> Result<()> {
match tag {
b"title" => {
entry.title = Some(read_text(reader, buf, limits)?);
}
b"link" => {
let link_text = read_text(reader, buf, limits)?;
entry.link = Some(link_text.clone());
let resolved_link = base_ctx.resolve_safe(&link_text);
entry.link = Some(resolved_link.clone());
entry.links.try_push_limited(
Link {
href: link_text,
href: resolved_link,
rel: Some("alternate".to_string()),
..Default::default()
},
@@ -1109,10 +1122,10 @@ mod tests {
</rss>"#;

let feed = parse_rss20(xml).unwrap();
assert!(feed.feed.updated.is_some());
assert!(feed.feed.published.is_some());
assert!(feed.entries[0].published.is_some());

let dt = feed.feed.updated.unwrap();
let dt = feed.feed.published.unwrap();
assert_eq!(dt.year(), 2024);
assert_eq!(dt.month(), 12);
assert_eq!(dt.day(), 14);
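RSS 2.0 has no `xml:base`, so `parse_channel_standard` promotes the first channel `<link>` to the fallback base — but only when no base is already set — and item links and enclosure URLs are then resolved through `resolve_safe`. A sketch of just that fallback rule, assuming the `url` crate:

```rust
use url::Url;

// Mirrors the guard around update_base in parse_channel_standard:
// the channel <link> becomes the base only if none exists yet.
fn fallback_base(current: Option<Url>, channel_link: &str) -> Option<Url> {
    current.or_else(|| Url::parse(channel_link).ok())
}

fn main() {
    let base = fallback_base(None, "https://example.com/podcast/").unwrap();
    // A host-relative enclosure url, as handled by resolve_safe.
    assert_eq!(
        base.join("/audio/ep1.mp3").unwrap().as_str(),
        "https://example.com/audio/ep1.mp3"
    );
}
```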
2 changes: 2 additions & 0 deletions crates/feedparser-rs-core/src/types/feed.rs
@@ -27,6 +27,8 @@ pub struct FeedMeta {
pub subtitle_detail: Option<TextConstruct>,
/// Last update date
pub updated: Option<DateTime<Utc>>,
/// Initial publication date (RSS pubDate, Atom published)
pub published: Option<DateTime<Utc>>,
/// Primary author name
pub author: Option<String>,
/// Detailed author information
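With the new `FeedMeta::published` field, channel-level RSS `pubDate` and Atom `<published>` now land in `published`, while `updated` is left to Atom's `<updated>`; the updated tests above reflect the remapping. A consumer-side sketch against a hypothetical excerpt of the struct:

```rust
use chrono::{DateTime, Utc};

// Hypothetical excerpt of FeedMeta, reduced to the two date fields.
struct FeedMeta {
    updated: Option<DateTime<Utc>>,
    published: Option<DateTime<Utc>>,
}

// Prefer the last-update timestamp, falling back to the publication
// date for feeds that only ever set pubDate or <published>.
fn last_activity(meta: &FeedMeta) -> Option<DateTime<Utc>> {
    meta.updated.or(meta.published)
}
```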