Skip to content

Commit 3a0a341

Browse files
Add test for HTML escaping in generated scraper code (#105)
Ensures that AI-generated Python code containing HTML-like sequences (</script>, </code>, </pre>) is properly escaped when rendered in the admin partial. This prevents XSS and JavaScript parsing errors. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude <noreply@anthropic.com>
1 parent b17462b commit 3a0a341

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

backend/tests/test_admin.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,39 @@ def test_generate_scraper_ai_not_available(self, mock_available, admin_client, d
801801
assert response.status_code == 400
802802
assert "not available" in response.text.lower() or "api" in response.text.lower()
803803

804+
@patch("app.routers.admin.generate_scraper_for_url")
@patch("app.routers.admin.is_ai_analysis_available")
def test_generated_scraper_escapes_html_in_code(
    self, mock_available, mock_generate, admin_client, db, active_source
):
    """Check that markup inside generated scraper code is HTML-escaped in the response.

    The mocked generator returns Python source whose string literals contain
    closing ``</script>``, ``</code>`` and ``</pre>`` tags. The response body
    must carry those tags in entity form and must not contain the raw tag
    runs verbatim.
    """
    from app.services.ai_analyzer import GeneratedScraper

    # Source text with tag sequences that would terminate the surrounding
    # markup if it were written into the page unescaped.
    hostile_source = '''class TestScraper(BaseScraper):
    def parse(self):
        html = "</script><script>alert('xss')</script>"
        code = "</code></pre><div>injected</div>"
        return html
'''

    # The bottom-most @patch supplies the first mock argument, so
    # mock_available replaces is_ai_analysis_available and mock_generate
    # replaces generate_scraper_for_url.
    mock_generate.return_value = GeneratedScraper(
        success=True,
        code=hostile_source,
        class_name="TestScraper",
    )
    mock_available.return_value = True

    endpoint = f"/admin/sources/{active_source.id}/generate-scraper"
    response = admin_client.post(endpoint)
    assert response.status_code == 200
    body = response.text

    # Every closing tag from the payload must show up in entity form.
    for escaped in ("&lt;/script&gt;", "&lt;/code&gt;", "&lt;/pre&gt;"):
        assert escaped in body

    # None of the raw tag runs may survive into the rendered output.
    for raw in ("</script><script>", "</code></pre>"):
        assert raw not in body
836+
804837

805838
class TestUrlNormalization:
806839
"""Tests for _normalize_url function used in duplicate detection."""

0 commit comments

Comments
 (0)