Skip to content

Commit e8b720b

Browse files
authored
Merge pull request #18 from bug-ops/feature/feedparser-parity
feat: RSS 1.0 parser, xml:base support, enhanced encoding detection
2 parents b27fc98 + 980e377 commit e8b720b

File tree

12 files changed

+1625
-217
lines changed

12 files changed

+1625
-217
lines changed

crates/feedparser-rs-core/src/parser/atom.rs

Lines changed: 6 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,8 @@ use crate::{
1313
use quick_xml::{Reader, events::Event};
1414

1515
use super::common::{
16-
EVENT_BUFFER_CAPACITY, FromAttributes, LimitedCollectionExt, bytes_to_string, init_feed,
17-
read_text, skip_element, skip_to_end,
16+
EVENT_BUFFER_CAPACITY, FromAttributes, LimitedCollectionExt, bytes_to_string, check_depth,
17+
init_feed, is_content_tag, is_dc_tag, is_media_tag, read_text, skip_element, skip_to_end,
1818
};
1919

2020
/// Parse Atom 1.0 feed from raw bytes
@@ -107,12 +107,7 @@ fn parse_feed_element(
107107
};
108108

109109
*depth += 1;
110-
if *depth > limits.max_nesting_depth {
111-
return Err(FeedError::InvalidFormat(format!(
112-
"XML nesting depth {depth} exceeds maximum {}",
113-
limits.max_nesting_depth
114-
)));
115-
}
110+
check_depth(*depth, limits.max_nesting_depth)?;
116111

117112
let element = e.to_owned();
118113
// Use name() instead of local_name() to preserve namespace prefixes
@@ -266,12 +261,7 @@ fn parse_entry(
266261
};
267262

268263
*depth += 1;
269-
if *depth > limits.max_nesting_depth {
270-
return Err(FeedError::InvalidFormat(format!(
271-
"XML nesting depth {depth} exceeds maximum {}",
272-
limits.max_nesting_depth
273-
)));
274-
}
264+
check_depth(*depth, limits.max_nesting_depth)?;
275265

276266
let element = e.to_owned();
277267
// Use name() instead of local_name() to preserve namespace prefixes
@@ -468,12 +458,7 @@ fn parse_person(
468458
match reader.read_event_into(buf) {
469459
Ok(Event::Start(e)) => {
470460
*depth += 1;
471-
if *depth > limits.max_nesting_depth {
472-
return Err(FeedError::InvalidFormat(format!(
473-
"XML nesting depth {} exceeds maximum {}",
474-
depth, limits.max_nesting_depth
475-
)));
476-
}
461+
check_depth(*depth, limits.max_nesting_depth)?;
477462

478463
match e.local_name().as_ref() {
479464
b"name" => name = Some(read_text(reader, buf, limits)?),
@@ -568,12 +553,7 @@ fn parse_atom_source(
568553
match reader.read_event_into(buf) {
569554
Ok(Event::Start(e) | Event::Empty(e)) => {
570555
*depth += 1;
571-
if *depth > limits.max_nesting_depth {
572-
return Err(FeedError::InvalidFormat(format!(
573-
"XML nesting depth {} exceeds maximum {}",
574-
depth, limits.max_nesting_depth
575-
)));
576-
}
556+
check_depth(*depth, limits.max_nesting_depth)?;
577557

578558
let element = e.to_owned();
579559
// Use name() instead of local_name() to preserve namespace prefixes
@@ -605,36 +585,6 @@ fn parse_atom_source(
605585
Ok(Source { title, link, id })
606586
}
607587

608-
/// Check if element name matches a Dublin Core namespace tag
609-
#[inline]
610-
fn is_dc_tag(name: &[u8]) -> Option<&str> {
611-
if name.starts_with(b"dc:") {
612-
std::str::from_utf8(&name[3..]).ok()
613-
} else {
614-
None
615-
}
616-
}
617-
618-
/// Check if element name matches a Content namespace tag
619-
#[inline]
620-
fn is_content_tag(name: &[u8]) -> Option<&str> {
621-
if name.starts_with(b"content:") {
622-
std::str::from_utf8(&name[8..]).ok()
623-
} else {
624-
None
625-
}
626-
}
627-
628-
/// Check if element name matches a Media RSS namespace tag
629-
#[inline]
630-
fn is_media_tag(name: &[u8]) -> Option<&str> {
631-
if name.starts_with(b"media:") {
632-
std::str::from_utf8(&name[6..]).ok()
633-
} else {
634-
None
635-
}
636-
}
637-
638588
#[cfg(test)]
639589
mod tests {
640590
use super::*;

crates/feedparser-rs-core/src/parser/common.rs

Lines changed: 89 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -145,9 +145,7 @@ pub fn init_feed(version: FeedVersion, max_entries: usize) -> ParsedFeed {
145145
/// Check nesting depth and return error if exceeded
146146
///
147147
/// This is a standalone helper for parsers that don't use `ParseContext`.
148-
/// Future use: Will be used when `ParseContext` is adopted project-wide
149148
#[inline]
150-
#[allow(dead_code)]
151149
pub fn check_depth(depth: usize, max_depth: usize) -> Result<()> {
152150
if depth > max_depth {
153151
return Err(FeedError::InvalidFormat(format!(
@@ -157,6 +155,95 @@ pub fn check_depth(depth: usize, max_depth: usize) -> Result<()> {
157155
Ok(())
158156
}
159157

158+
/// Extract local name from namespaced element if prefix matches
159+
///
160+
/// Validates tag name contains only alphanumeric characters and hyphens
161+
/// to prevent injection attacks.
162+
///
163+
/// # Examples
164+
///
165+
/// ```ignore
166+
/// assert_eq!(extract_ns_local_name(b"dc:creator", b"dc:"), Some("creator"));
167+
/// assert_eq!(extract_ns_local_name(b"dc:creator", b"atom:"), None);
168+
/// assert_eq!(extract_ns_local_name(b"dc:<script>", b"dc:"), None); // Invalid chars
169+
/// ```
170+
#[inline]
171+
pub fn extract_ns_local_name<'a>(name: &'a [u8], prefix: &[u8]) -> Option<&'a str> {
172+
if name.starts_with(prefix) {
173+
let tag_name = std::str::from_utf8(&name[prefix.len()..]).ok()?;
174+
// Security: validate tag name (alphanumeric + hyphen only)
175+
if !tag_name.is_empty() && tag_name.chars().all(|c| c.is_alphanumeric() || c == '-') {
176+
Some(tag_name)
177+
} else {
178+
None
179+
}
180+
} else {
181+
None
182+
}
183+
}
184+
185+
/// Check if element is a Dublin Core namespaced tag
186+
///
187+
/// # Examples
188+
///
189+
/// ```ignore
190+
/// assert_eq!(is_dc_tag(b"dc:creator"), Some("creator"));
191+
/// assert_eq!(is_dc_tag(b"dc:subject"), Some("subject"));
192+
/// assert_eq!(is_dc_tag(b"content:encoded"), None);
193+
/// ```
194+
#[inline]
195+
pub fn is_dc_tag(name: &[u8]) -> Option<&str> {
196+
extract_ns_local_name(name, b"dc:")
197+
}
198+
199+
/// Check if element is a Content namespaced tag
200+
///
201+
/// # Examples
202+
///
203+
/// ```ignore
204+
/// assert_eq!(is_content_tag(b"content:encoded"), Some("encoded"));
205+
/// assert_eq!(is_content_tag(b"dc:creator"), None);
206+
/// ```
207+
#[inline]
208+
pub fn is_content_tag(name: &[u8]) -> Option<&str> {
209+
extract_ns_local_name(name, b"content:")
210+
}
211+
212+
/// Check if element is a Media RSS namespaced tag
213+
///
214+
/// # Examples
215+
///
216+
/// ```ignore
217+
/// assert_eq!(is_media_tag(b"media:thumbnail"), Some("thumbnail"));
218+
/// assert_eq!(is_media_tag(b"media:content"), Some("content"));
219+
/// assert_eq!(is_media_tag(b"dc:creator"), None);
220+
/// ```
221+
#[inline]
222+
pub fn is_media_tag(name: &[u8]) -> Option<&str> {
223+
extract_ns_local_name(name, b"media:")
224+
}
225+
226+
/// Check if element matches an iTunes namespace tag
227+
///
228+
/// Supports both prefixed (itunes:author) and unprefixed (author) forms
229+
/// for compatibility with non-compliant feeds.
230+
///
231+
/// # Examples
232+
///
233+
/// ```ignore
234+
/// assert!(is_itunes_tag(b"itunes:author", b"author"));
235+
/// assert!(is_itunes_tag(b"author", b"author")); // Fallback for non-prefixed
236+
/// assert!(!is_itunes_tag(b"itunes:title", b"author"));
237+
/// ```
238+
#[inline]
239+
pub fn is_itunes_tag(name: &[u8], tag: &[u8]) -> bool {
240+
if name.starts_with(b"itunes:") && &name[7..] == tag {
241+
return true;
242+
}
243+
// Fallback for feeds without prefix
244+
name == tag
245+
}
246+
160247
/// Read text content from current XML element (handles text and CDATA)
161248
pub fn read_text(
162249
reader: &mut Reader<&[u8]>,

crates/feedparser-rs-core/src/parser/json.rs

Lines changed: 11 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@ use crate::{
99
Content, Enclosure, Entry, FeedMeta, FeedVersion, Image, LimitedCollectionExt, Link,
1010
ParseFrom, ParsedFeed, Person, Tag, TextConstruct,
1111
},
12-
util::date::parse_date,
12+
util::{date::parse_date, text::truncate_to_length},
1313
};
1414
use serde_json::Value;
1515

@@ -76,7 +76,7 @@ pub fn parse_json_feed_with_limits(data: &[u8], limits: ParserLimits) -> Result<
7676

7777
fn parse_feed_metadata(json: &Value, feed: &mut FeedMeta, limits: &ParserLimits) {
7878
if let Some(title) = json.get("title").and_then(|v| v.as_str()) {
79-
let truncated = truncate_text(title, limits.max_text_length);
79+
let truncated = truncate_to_length(title, limits.max_text_length);
8080
feed.set_title(TextConstruct::text(&truncated));
8181
}
8282

@@ -94,7 +94,7 @@ fn parse_feed_metadata(json: &Value, feed: &mut FeedMeta, limits: &ParserLimits)
9494
}
9595

9696
if let Some(description) = json.get("description").and_then(|v| v.as_str()) {
97-
let truncated = truncate_text(description, limits.max_text_length);
97+
let truncated = truncate_to_length(description, limits.max_text_length);
9898
feed.subtitle_detail = Some(TextConstruct::text(&truncated));
9999
feed.subtitle = Some(truncated);
100100
}
@@ -160,26 +160,26 @@ fn parse_item(json: &Value, limits: &ParserLimits) -> Entry {
160160
}
161161

162162
if let Some(title) = json.get("title").and_then(|v| v.as_str()) {
163-
let truncated = truncate_text(title, limits.max_text_length);
163+
let truncated = truncate_to_length(title, limits.max_text_length);
164164
entry.set_title(TextConstruct::text(&truncated));
165165
}
166166

167167
if let Some(content_html) = json.get("content_html").and_then(|v| v.as_str()) {
168-
let text = truncate_text(content_html, limits.max_text_length);
168+
let text = truncate_to_length(content_html, limits.max_text_length);
169169
let _ = entry
170170
.content
171171
.try_push_limited(Content::html(text), limits.max_entries);
172172
}
173173

174174
if let Some(content_text) = json.get("content_text").and_then(|v| v.as_str()) {
175-
let text = truncate_text(content_text, limits.max_text_length);
175+
let text = truncate_to_length(content_text, limits.max_text_length);
176176
let _ = entry
177177
.content
178178
.try_push_limited(Content::plain(text), limits.max_entries);
179179
}
180180

181181
if let Some(summary) = json.get("summary").and_then(|v| v.as_str()) {
182-
let truncated = truncate_text(summary, limits.max_text_length);
182+
let truncated = truncate_to_length(summary, limits.max_text_length);
183183
entry.set_summary(TextConstruct::text(&truncated));
184184
}
185185

@@ -265,15 +265,6 @@ fn parse_authors(
265265
}
266266
}
267267

268-
/// Truncate text to maximum length
269-
fn truncate_text(text: &str, max_length: usize) -> String {
270-
if text.len() <= max_length {
271-
text.to_string()
272-
} else {
273-
text.chars().take(max_length).collect()
274-
}
275-
}
276-
277268
#[cfg(test)]
278269
mod tests {
279270
use super::*;
@@ -496,9 +487,9 @@ mod tests {
496487
}
497488

498489
#[test]
499-
fn test_truncate_text() {
500-
assert_eq!(truncate_text("hello", 10), "hello");
501-
assert_eq!(truncate_text("hello world", 5), "hello");
502-
assert_eq!(truncate_text("", 10), "");
490+
fn test_truncate_to_length() {
491+
assert_eq!(truncate_to_length("hello", 10), "hello");
492+
assert_eq!(truncate_to_length("hello world", 5), "hello");
493+
assert_eq!(truncate_to_length("", 10), "");
503494
}
504495
}

crates/feedparser-rs-core/src/parser/mod.rs

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ mod detect;
44
pub mod json;
55
pub mod namespace_detection;
66
pub mod rss;
7+
pub mod rss10;
78

89
use crate::{error::Result, types::ParsedFeed};
910

@@ -63,7 +64,6 @@ pub fn parse(data: &[u8]) -> Result<ParsedFeed> {
6364
/// - Format is unknown or unsupported
6465
/// - Fatal parsing error occurs
6566
pub fn parse_with_limits(data: &[u8], limits: crate::ParserLimits) -> Result<ParsedFeed> {
66-
use crate::FeedError;
6767
use crate::types::FeedVersion;
6868

6969
// Detect format
@@ -79,10 +79,8 @@ pub fn parse_with_limits(data: &[u8], limits: crate::ParserLimits) -> Result<Par
7979
// Atom variants
8080
FeedVersion::Atom10 | FeedVersion::Atom03 => atom::parse_atom10_with_limits(data, limits),
8181

82-
// RSS 1.0 (RDF) - TODO: Phase 3
83-
FeedVersion::Rss10 => Err(FeedError::InvalidFormat(
84-
"RSS 1.0 not yet supported (Phase 3)".to_string(),
85-
)),
82+
// RSS 1.0 (RDF)
83+
FeedVersion::Rss10 => rss10::parse_rss10_with_limits(data, limits),
8684

8785
// JSON Feed
8886
FeedVersion::JsonFeed10 | FeedVersion::JsonFeed11 => {

0 commit comments

Comments
 (0)