
Commit 727d887

feat(py): add auto-URL detection to parse() for feedparser compatibility
The parse() function now automatically detects URLs (http://, https://) and fetches them when the http feature is enabled. This matches Python feedparser's behavior, where parse() accepts both URLs and content.

Changes:
- Add optional etag, modified, user_agent params to parse()
- Add optional HTTP params to parse_with_limits()
- Create internal parse_internal() for shared URL/content logic
- Detect URLs by their http:// or https:// prefix
- When the http feature is disabled, return NotImplementedError for URLs
- Update existing tests to pass limits as a keyword argument
1 parent 5c88930 commit 727d887
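
As a quick orientation for the behavior described in the commit message, here is a minimal, hypothetical usage sketch (the URL and the load() helper are placeholders, not part of this commit): parse() now accepts either feed content or a URL, and raises NotImplementedError for URLs when the extension is built without the http feature.

```python
import feedparser_rs

def load(source):
    # Content and URLs go through the same entry point; URL detection is
    # based on the http:// or https:// prefix.
    try:
        return feedparser_rs.parse(source)
    except NotImplementedError:
        # Built without the 'http' feature: URLs are detected but not fetched,
        # so fall back to fetching the bytes ourselves with the stdlib.
        from urllib.request import urlopen
        with urlopen(source) as resp:
            return feedparser_rs.parse(resp.read())

d = load('<rss version="2.0"><channel><title>Inline</title></channel></rss>')
# d = load("https://example.com/feed.xml")  # placeholder URL
```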

File tree: 3 files changed, +229 −22 lines changed

crates/feedparser-rs-py/src/lib.rs

Lines changed: 117 additions & 20 deletions
@@ -40,39 +40,136 @@ fn _feedparser_rs(m: &Bound<'_, PyModule>) -> PyResult<()> {
     Ok(())
 }

-/// Parse an RSS/Atom/JSON Feed from bytes or string
+/// Parse an RSS/Atom/JSON Feed from bytes, string, or URL
+///
+/// Automatically detects whether `source` is a URL (http://, https://) or content.
+/// For URLs, fetches and parses the feed. For content, parses directly.
+///
+/// # Arguments
+///
+/// * `source` - URL string, feed content string, or bytes
+/// * `etag` - Optional ETag from previous fetch (for URLs with conditional GET)
+/// * `modified` - Optional Last-Modified timestamp (for URLs with conditional GET)
+/// * `user_agent` - Optional custom User-Agent header (for URLs)
+///
+/// # Examples
+///
+/// ```python
+/// import feedparser_rs
+///
+/// # Parse from URL (auto-detected)
+/// feed = feedparser_rs.parse("https://example.com/feed.xml")
+///
+/// # Parse from content
+/// feed = feedparser_rs.parse("<rss>...</rss>")
+///
+/// # Parse from URL with caching
+/// feed = feedparser_rs.parse(
+///     "https://example.com/feed.xml",
+///     etag=cached_etag,
+///     modified=cached_modified
+/// )
+/// ```
 #[pyfunction]
-#[pyo3(signature = (source, /))]
-fn parse(py: Python<'_>, source: &Bound<'_, PyAny>) -> PyResult<PyParsedFeed> {
-    parse_with_limits(py, source, None)
+#[pyo3(signature = (source, /, etag=None, modified=None, user_agent=None))]
+fn parse(
+    py: Python<'_>,
+    source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
+) -> PyResult<PyParsedFeed> {
+    parse_internal(py, source, etag, modified, user_agent, None)
 }

 /// Parse with custom resource limits for DoS protection
+///
+/// Like `parse()` but allows specifying custom limits for untrusted feeds.
+///
+/// # Arguments
+///
+/// * `source` - URL string, feed content string, or bytes
+/// * `etag` - Optional ETag from previous fetch (for URLs)
+/// * `modified` - Optional Last-Modified timestamp (for URLs)
+/// * `user_agent` - Optional custom User-Agent header (for URLs)
+/// * `limits` - Optional parser limits for DoS protection
+///
+/// # Examples
+///
+/// ```python
+/// import feedparser_rs
+///
+/// limits = feedparser_rs.ParserLimits.strict()
+///
+/// # Parse from URL with limits
+/// feed = feedparser_rs.parse_with_limits(
+///     "https://example.com/feed.xml",
+///     limits=limits
+/// )
+///
+/// # Parse from content with limits
+/// feed = feedparser_rs.parse_with_limits("<rss>...</rss>", limits=limits)
+/// ```
 #[pyfunction]
-#[pyo3(signature = (source, limits=None))]
+#[pyo3(signature = (source, /, etag=None, modified=None, user_agent=None, limits=None))]
 fn parse_with_limits(
     py: Python<'_>,
     source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
     limits: Option<&PyParserLimits>,
 ) -> PyResult<PyParsedFeed> {
-    let bytes: Vec<u8> = if let Ok(s) = source.extract::<String>() {
+    parse_internal(py, source, etag, modified, user_agent, limits)
+}
+
+/// Internal parse function that handles both URL and content sources
+fn parse_internal(
+    py: Python<'_>,
+    source: &Bound<'_, PyAny>,
+    etag: Option<&str>,
+    modified: Option<&str>,
+    user_agent: Option<&str>,
+    limits: Option<&PyParserLimits>,
+) -> PyResult<PyParsedFeed> {
+    // Try to extract as string first
+    if let Ok(s) = source.extract::<String>() {
+        // Check if it's a URL
         if s.starts_with("http://") || s.starts_with("https://") {
-            return Err(pyo3::exceptions::PyNotImplementedError::new_err(
-                "URL fetching not implemented. Use requests.get(url).content",
-            ));
+            // Handle URL - requires http feature
+            #[cfg(feature = "http")]
+            {
+                let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+                let parsed =
+                    core::parse_url_with_limits(&s, etag, modified, user_agent, parser_limits)
+                        .map_err(convert_feed_error)?;
+                return PyParsedFeed::from_core(py, parsed);
+            }
+            #[cfg(not(feature = "http"))]
+            {
+                return Err(pyo3::exceptions::PyNotImplementedError::new_err(
+                    "URL fetching requires the 'http' feature. Build with: maturin develop --features http",
+                ));
+            }
         }
-        s.into_bytes()
-    } else if let Ok(b) = source.extract::<Vec<u8>>() {
-        b
-    } else {
-        return Err(pyo3::exceptions::PyTypeError::new_err(
-            "source must be str or bytes",
-        ));
-    };

-    let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
-    let parsed = core::parse_with_limits(&bytes, parser_limits).map_err(convert_feed_error)?;
-    PyParsedFeed::from_core(py, parsed)
+        // Parse as content
+        let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+        let parsed =
+            core::parse_with_limits(s.as_bytes(), parser_limits).map_err(convert_feed_error)?;
+        return PyParsedFeed::from_core(py, parsed);
+    }
+
+    // Try to extract as bytes
+    if let Ok(b) = source.extract::<Vec<u8>>() {
+        let parser_limits = limits.map(|l| l.to_core_limits()).unwrap_or_default();
+        let parsed = core::parse_with_limits(&b, parser_limits).map_err(convert_feed_error)?;
+        return PyParsedFeed::from_core(py, parsed);
+    }
+
+    Err(pyo3::exceptions::PyTypeError::new_err(
+        "source must be str, bytes, or URL",
+    ))
 }

 /// Detect feed format without full parsing
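
The etag and modified parameters added above map onto HTTP conditional GET. Below is a minimal sketch of that round trip, assuming a build with the http feature enabled; the URL and validator values are hypothetical stand-ins for whatever a caller saved from an earlier response (this diff does not show how the parsed result exposes them).

```python
import feedparser_rs

url = "https://example.com/feed.xml"  # placeholder URL

# First fetch: no validators yet.
feed = feedparser_rs.parse(url, user_agent="MyReader/1.0")

# Later fetch: replay the validators saved from the earlier response so the
# server can answer 304 Not Modified instead of resending the body.
cached_etag = '"33a64df551425fcc"'                 # hypothetical stored value
cached_modified = "Sat, 01 Feb 2025 00:00:00 GMT"  # hypothetical stored value

feed = feedparser_rs.parse(
    url,
    etag=cached_etag,
    modified=cached_modified,
    user_agent="MyReader/1.0",
)
```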

crates/feedparser-rs-py/tests/test_basic.py

Lines changed: 2 additions & 2 deletions
@@ -137,7 +137,7 @@ def test_parse_with_limits():
         max_entries=10,
     )

-    d = feedparser_rs.parse_with_limits(xml, limits)
+    d = feedparser_rs.parse_with_limits(xml, limits=limits)
     assert d.version == "rss20"


@@ -150,7 +150,7 @@ def test_parse_with_limits_exceeded():
     )

     with pytest.raises(ValueError, match="exceeds maximum"):
-        feedparser_rs.parse_with_limits(xml, limits)
+        feedparser_rs.parse_with_limits(xml, limits=limits)


 def test_detect_format_rss20():

crates/feedparser-rs-py/tests/test_compat.py

Lines changed: 110 additions & 0 deletions
@@ -627,3 +627,113 @@ def test_dict_access_list_fields():
     assert len(entry['links']) >= 1
     assert len(entry['tags']) == 1
     assert entry['tags'][0].term == "rust"
+
+
+# =============================================================================
+# Phase 4: Auto-URL Detection Tests
+# =============================================================================
+
+
+def test_parse_with_optional_http_params():
+    """Test that parse() accepts optional HTTP parameters for URL fetching"""
+    # When parsing content (not URL), these params should be ignored
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    # Should work with optional params (they're just ignored for content)
+    feed = feedparser_rs.parse(xml, etag="some-etag", modified="some-date")
+    assert feed.feed.title == "Test Feed"
+    assert feed.version == 'rss20'
+
+
+def test_parse_with_user_agent_param():
+    """Test that parse() accepts user_agent parameter"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    # Should work with user_agent param (ignored for content)
+    feed = feedparser_rs.parse(xml, user_agent="TestBot/1.0")
+    assert feed.feed.title == "Test Feed"
+
+
+def test_parse_url_detection_http():
+    """Test that parse() detects http:// URLs"""
+    # This test verifies URL detection logic without actually fetching
+    # Since we don't have an HTTP feature enabled or a real server,
+    # we just verify the parse function signature accepts URL-like strings
+    try:
+        # This will either succeed (if http feature enabled and server exists)
+        # or raise NotImplementedError (if http feature disabled)
+        feedparser_rs.parse("http://example.com/nonexistent")
+    except NotImplementedError as e:
+        # http feature not enabled - this is expected
+        assert "http" in str(e).lower()
+    except Exception:
+        # Some other error (network, etc.) - also acceptable
+        pass
+
+
+def test_parse_url_detection_https():
+    """Test that parse() detects https:// URLs"""
+    try:
+        feedparser_rs.parse("https://example.com/nonexistent")
+    except NotImplementedError as e:
+        # http feature not enabled - this is expected
+        assert "http" in str(e).lower()
+    except Exception:
+        # Some other error (network, etc.) - also acceptable
+        pass
+
+
+def test_parse_content_starting_with_http_in_text():
+    """Test that content containing 'http' as text is not treated as URL"""
+    # This should be parsed as content, not as a URL
+    xml = """<rss version="2.0">
+    <channel>
+        <title>HTTP Guide</title>
+        <description>Learn about http protocol</description>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    assert feed.feed.title == "HTTP Guide"
+    assert "http" in feed.feed.subtitle.lower()
+
+
+def test_parse_bytes_content():
+    """Test that bytes content is still parsed correctly"""
+    xml = b"""<rss version="2.0">
+    <channel>
+        <title>Bytes Feed</title>
+    </channel>
+    </rss>"""
+
+    feed = feedparser_rs.parse(xml)
+    assert feed.feed.title == "Bytes Feed"
+
+
+def test_parse_with_limits_accepts_http_params():
+    """Test that parse_with_limits() also accepts HTTP parameters"""
+    xml = """<rss version="2.0">
+    <channel>
+        <title>Test Feed</title>
+    </channel>
+    </rss>"""
+
+    limits = feedparser_rs.ParserLimits()
+
+    # Should work with all optional params
+    feed = feedparser_rs.parse_with_limits(
+        xml,
+        etag="etag",
+        modified="modified",
+        user_agent="TestBot/1.0",
+        limits=limits
+    )
+    assert feed.feed.title == "Test Feed"
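
The new tests above only exercise signatures and error paths; none of them performs a real fetch. A full round trip could be covered with a loopback server from the standard library, sketched below. This test is not part of the commit and assumes the same pytest conventions as test_compat.py.

```python
import threading
import http.server
import socketserver

import feedparser_rs

RSS = b"""<rss version="2.0">
<channel>
    <title>Local Feed</title>
</channel>
</rss>"""


class _FeedHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        # Serve the same small RSS document for every request.
        self.send_response(200)
        self.send_header("Content-Type", "application/rss+xml")
        self.send_header("Content-Length", str(len(RSS)))
        self.end_headers()
        self.wfile.write(RSS)

    def log_message(self, *args):
        # Keep pytest output quiet.
        pass


def test_parse_fetches_from_local_url():
    """Hypothetical test: parse() fetches a feed served on a loopback port"""
    with socketserver.TCPServer(("127.0.0.1", 0), _FeedHandler) as server:
        port = server.server_address[1]
        threading.Thread(target=server.serve_forever, daemon=True).start()
        try:
            feed = feedparser_rs.parse(f"http://127.0.0.1:{port}/feed.xml")
            assert feed.feed.title == "Local Feed"
        except NotImplementedError:
            # Build without the 'http' feature - URL fetching unavailable.
            pass
        finally:
            server.shutdown()
```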
