
Commit 727d887

feat(py): add auto-URL detection to parse() for feedparser compatibility
The parse() function now automatically detects URLs (http://, https://) and fetches them when the http feature is enabled. This matches Python feedparser's behavior, where parse() accepts both URLs and content.

Changes:
- Add optional etag, modified, user_agent params to parse()
- Add optional HTTP params to parse_with_limits()
- Create internal parse_internal() for shared URL/content logic
- Detect URLs by their http:// or https:// prefix
- When the http feature is disabled, return NotImplementedError for URLs
- Update existing tests to pass limits as a keyword argument
1 parent 5c88930 commit 727d887
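
As a quick orientation for the behavior described in the commit message, here is a minimal, hypothetical usage sketch (the URL and the load() helper are placeholders, not part of this commit): parse() now accepts either feed content or a URL, and raises NotImplementedError for URLs when the extension is built without the http feature.

```python
import feedparser_rs

def load(source):
    # Content and URLs go through the same entry point; URL detection is
    # based on the http:// or https:// prefix.
    try:
        return feedparser_rs.parse(source)
    except NotImplementedError:
        # Built without the 'http' feature: URLs are detected but not fetched,
        # so fall back to fetching the bytes ourselves with the stdlib.
        from urllib.request import urlopen
        with urlopen(source) as resp:
            return feedparser_rs.parse(resp.read())

d = load('<rss version="2.0"><channel><title>Inline</title></channel></rss>')
# d = load("https://example.com/feed.xml")  # placeholder URL
```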

File tree: 3 files changed, +229 −22 lines changed

crates/feedparser-rs-py/src/lib.rs

Lines changed: 117 additions & 20 deletions
@@ -40,39 +40,136 @@ fn _feedparser_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
     Ok(())
 }

-/// Parse an RSS/Atom/JSON Feed from bytes or string
+/// Parse an RSS/Atom/JSON Feed from bytes, string, or URL
+///
+/// Automatically detects whether `source` is a URL (http://, https://) or content.
+/// For URLs, fetches and parses the feed. For content, parses directly.
+///
+/// # Arguments
+///
+/// * `source` - URL string, feed content string, or bytes
+/// * `etag` - Optional ETag from previous fetch (for URLs with conditional GET)
+/// * `modified` - Optional Last-Modified timestamp (for URLs with conditional GET)
+/// * `user_agent` - Optional custom User-Agent header (for URLs)
+///
+/// # Examples
+///
+/// ```python
+/// import feedparser_rs
+///
+/// # Parse from URL (auto-detected)
+/// feed = feedparser_rs.parse("https://example.com/feed.xml")
+///
+/// # Parse from content
+/// feed = feedparser_rs.parse("<rss>...</rss>")
+///
+/// # Parse from URL with caching
+/// feed = feedparser_rs.parse(
+///     "https://example.com/feed.xml",
+///     etag=cached_etag,
+///     modified=cached_modified
+/// )
+/// ```
 #[pyfunction]
-#[pyo3(signature = (source, /))]
-fn parse(py: Python<'_>, source: &Bound<'_, PyAny>) -> PyResult<PyParsedFeed> {
-    parse_with_limits(py, source, None)
+#[pyo3(signature = (source, /, etag=None, modified=None, user_agent=None))]
+fn parse(
+    py: Python<'_>,
+    source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
+) -> PyResult<PyParsedFeed> {
+    parse_internal(py, source, etag, modified, user_agent, None)
 }

 /// Parse with custom resource limits for DoS protection
+///
+/// Like `parse()` but allows specifying custom limits for untrusted feeds.
+///
+/// # Arguments
+///
+/// * `source` - URL string, feed content string, or bytes
+/// * `etag` - Optional ETag from previous fetch (for URLs)
+/// * `modified` - Optional Last-Modified timestamp (for URLs)
+/// * `user_agent` - Optional custom User-Agent header (for URLs)
+/// * `limits` - Optional parser limits for DoS protection
+///
+/// # Examples
+///
+/// ```python
+/// import feedparser_rs
+///
+/// limits = feedparser_rs.ParserLimits.strict()
+///
+/// # Parse from URL with limits
+/// feed = feedparser_rs.parse_with_limits(
+///     "https://example.com/feed.xml",
+///     limits=limits
+/// )
+///
+/// # Parse from content with limits
+/// feed = feedparser_rs.parse_with_limits("<rss>...</rss>", limits=limits)
+/// ```
 #[pyfunction]
-#[pyo3(signature = (source, limits=None))]
+#[pyo3(signature = (source, /, etag=None, modified=None, user_agent=None, limits=None))]
 fn parse_with_limits(
     py: Python<'_>,
     source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
     limits: Option<&PyParserLimits>,
 ) -> PyResult<PyParsedFeed> {
-    let bytes: Vec<u8> = if let Ok(s) = source.extract::<String>() {
+    parse_internal(py, source, etag, modified, user_agent, limits)
+}
+
+/// Internal parse function that handles both URL and content sources
+fn parse_internal(
+    py: Python<'_>,
+    source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
+    limits: Option<&PyParserLimits>,
+) -> PyResult<PyParsedFeed> {
+    // Try to extract as string first
+    if let Ok(s) = source.extract::<String>() {
+        // Check if it's a URL
         if s.starts_with("http://") || s.starts_with("https://") {
-            return Err(pyo3::exceptions::PyNotImplementedError::new_err(
-                "URL fetching not implemented. Use requests.get(url).content",
-            ));
+            // Handle URL - requires http feature
+            #[cfg(feature = "http")]
+            {
+                let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+                let parsed =
+                    core::parse_url_with_limits(&s, etag, modified, user_agent, parser_limits)
+                        .map_err(convert_feed_error)?;
+                return PyParsedFeed::from_core(py, parsed);
+            }
+            #[cfg(not(feature = "http"))]
+            {
+                return Err(pyo3::exceptions::PyNotImplementedError::new_err(
+                    "URL fetching requires the 'http' feature. Build with: maturin develop --features http",
+                ));
+            }
         }
-        s.into_bytes()
-    } else if let Ok(b) = source.extract::<Vec<u8>>() {
-        b
-    } else {
-        return Err(pyo3::exceptions::PyTypeError::new_err(
-            "source must be str or bytes",
-        ));
-    };

-    let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
-    let parsed = core::parse_with_limits(&bytes, parser_limits).map_err(convert_feed_error)?;
-    PyParsedFeed::from_core(py, parsed)
+        // Parse as content
+        let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+        let parsed =
+            core::parse_with_limits(s.as_bytes(), parser_limits).map_err(convert_feed_error)?;
+        return PyParsedFeed::from_core(py, parsed);
+    }
+
+    // Try to extract as bytes
+    if let Ok(b) = source.extract::<Vec<u8>>() {
+        let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+        let parsed = core::parse_with_limits(&b, parser_limits).map_err(convert_feed_error)?;
+        return PyParsedFeed::from_core(py, parsed);
+    }
+
+    Err(pyo3::exceptions::PyTypeError::new_err(
+        "source must be str, bytes, or URL",
+    ))
 }

 /// Detect feed format without full parsing
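
The etag and modified parameters added above map onto HTTP conditional GET. Below is a minimal sketch of that round trip, assuming a build with the http feature enabled; the URL and validator values are hypothetical stand-ins for whatever a caller saved from an earlier response (this diff does not show how the parsed result exposes them).

```python
import feedparser_rs

url = "https://example.com/feed.xml"  # placeholder URL

# First fetch: no validators yet.
feed = feedparser_rs.parse(url, user_agent="MyReader/1.0")

# Later fetch: replay the validators saved from the earlier response so the
# server can answer 304 Not Modified instead of resending the body.
cached_etag = '"33a64df551425fcc"'                 # hypothetical stored value
cached_modified = "Sat, 01 Feb 2025 00:00:00 GMT"  # hypothetical stored value

feed = feedparser_rs.parse(
    url,
    etag=cached_etag,
    modified=cached_modified,
    user_agent="MyReader/1.0",
)
```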

crates/feedparser-rs-py/tests/test_basic.py

Lines changed: 2 additions & 2 deletions
@@ -137,7 +137,7 @@ def test_parse_with_limits():
         max_entries=10,
     )

-    d = feedparser_rs.parse_with_limits(xml, limits)
+    d = feedparser_rs.parse_with_limits(xml, limits=limits)
     assert d.version == "rss20"


@@ -150,7 +150,7 @@ def test_parse_with_limits_exceeded():
     )

     with pytest.raises(ValueError, match="exceeds maximum"):
-        feedparser_rs.parse_with_limits(xml, limits)
+        feedparser_rs.parse_with_limits(xml, limits=limits)


 def test_detect_format_rss20():

crates/feedparser-rs-py/tests/test_compat.py

Lines changed: 110 additions & 0 deletions
@@ -627,3 +627,113 @@ def test_dict_access_list_fields():
     assert len(entry['links']) >= 1
     assert len(entry['tags']) == 1
     assert entry['tags'][0].term == "rust"
+
+
+# =============================================================================
+# Phase 4: Auto-URL Detection Tests
+# =============================================================================
+
+
+def test_parse_with_optional_http_params():
+    """Test that parse() accepts optional HTTP parameters for URL fetching"""
+    # When parsing content (not URL), these params should be ignored
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    # Should work with optional params (they're just ignored for content)
+    feed = feedparser_rs.parse(xml, etag="some-etag", modified="some-date")
+    assert feed.feed.title == "Test Feed"
+    assert feed.version == 'rss20'
+
+
+def test_parse_with_user_agent_param():
+    """Test that parse() accepts user_agent parameter"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    # Should work with user_agent param (ignored for content)
+    feed = feedparser_rs.parse(xml, user_agent="TestBot/1.0")
+    assert feed.feed.title == "Test Feed"
+
+
+def test_parse_url_detection_http():
+    """Test that parse() detects http:// URLs"""
+    # This test verifies URL detection logic without actually fetching
+    # Since we don't have an HTTP feature enabled or a real server,
+    # we just verify the parse function signature accepts URL-like strings
+    try:
+        # This will either succeed (if http feature enabled and server exists)
+        # or raise NotImplementedError (if http feature disabled)
+        feedparser_rs.parse("http://example.com/nonexistent")
+    except NotImplementedError as e:
+        # http feature not enabled - this is expected
+        assert "http" in str(e).lower()
+    except Exception:
+        # Some other error (network, etc.) - also acceptable
+        pass
+
+
+def test_parse_url_detection_https():
+    """Test that parse() detects https:// URLs"""
+    try:
+        feedparser_rs.parse("https://example.com/nonexistent")
+    except NotImplementedError as e:
+        # http feature not enabled - this is expected
+        assert "http" in str(e).lower()
+    except Exception:
+        # Some other error (network, etc.) - also acceptable
+        pass
+
+
+def test_parse_content_starting_with_http_in_text():
+    """Test that content containing 'http' as text is not treated as URL"""
+    # This should be parsed as content, not as a URL
+    xml = """<rss version="2.0">
+    <channel>
+        <title>HTTP Guide</title>
+        <description>Learn about http protocol</description>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    assert feed.feed.title == "HTTP Guide"
+    assert "http" in feed.feed.subtitle.lower()
+
+
+def test_parse_bytes_content():
+    """Test that bytes content is still parsed correctly"""
+    xml = b"""<rss version="2.0">
+    <channel>
+        <title>Bytes Feed</title>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    assert feed.feed.title == "Bytes Feed"
+
+
+def test_parse_with_limits_accepts_http_params():
+    """Test that parse_with_limits() also accepts HTTP parameters"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    limits = feedparser_rs.ParserLimits()
+
+    # Should work with all optional params
+    feed = feedparser_rs.parse_with_limits(
+        xml,
+        etag="etag",
+        modified="modified",
+        user_agent="TestBot/1.0",
+        limits=limits
+    )
+    assert feed.feed.title == "Test Feed"
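
The new tests above only exercise signatures and error paths; none of them performs a real fetch. A full round trip could be covered with a loopback server from the standard library, sketched below. This test is not part of the commit and assumes the same pytest conventions as test_compat.py.

```python
import threading
import http.server
import socketserver

import feedparser_rs

RSS = b"""<rss version="2.0">
<channel>
    <title>Local Feed</title>
</channel>
</rss>"""


class _FeedHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        # Serve the same small RSS document for every request.
        self.send_response(200)
        self.send_header("Content-Type", "application/rss+xml")
        self.send_header("Content-Length", str(len(RSS)))
        self.end_headers()
        self.wfile.write(RSS)

    def log_message(self, *args):
        # Keep pytest output quiet.
        pass


def test_parse_fetches_from_local_url():
    """Hypothetical test: parse() fetches a feed served on a loopback port"""
    with socketserver.TCPServer(("127.0.0.1", 0), _FeedHandler) as server:
        port = server.server_address[1]
        threading.Thread(target=server.serve_forever, daemon=True).start()
        try:
            feed = feedparser_rs.parse(f"http://127.0.0.1:{port}/feed.xml")
            assert feed.feed.title == "Local Feed"
        except NotImplementedError:
            # Build without the 'http' feature - URL fetching unavailable.
            pass
        finally:
            server.shutdown()
```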
