@@ -1491,3 +1491,31 @@ async def setup_before_prep(self, module_test):
def check(self, module_test, events):
    """Verify that the expected hostname was emitted as the data of some event.

    Args:
        module_test: Test harness object; not used by this check.
        events: Iterable of event objects exposing a ``data`` attribute.

    Raises:
        AssertionError: If no event's ``data`` equals the expected hostname.
    """
    expected_hostname = "asdffoo.test.notreal"
    # Lazy membership test: stops at the first event whose data matches.
    assert expected_hostname in (event.data for event in events)
1494+
1495+
class TestExcavateIgnorePDF(ModuleTestBase):
    """Check that nothing is extracted from a response served as ``application/pdf``.

    The mocked server answers ``/`` with a plain-text body containing an
    https URL and an ftp URL, labelled with a PDF Content-Type. The check
    then asserts that neither string surfaces in the resulting events.
    """

    targets = ["http://127.0.0.1:8888/"]
    modules_overrides = ["excavate", "httpx"]

    # body content that would normally produce findings if processed
    pdf_body_with_urls = "https://pdf-extracted.test.notreal/some/path ftp://ftp.test.notreal"

    async def setup_after_prep(self, module_test):
        """Register the mocked ``/`` response carrying the PDF Content-Type."""
        request_matcher = {"uri": "/"}
        pdf_response = {
            "response_data": self.pdf_body_with_urls,
            "headers": {"Content-Type": "application/pdf"},
        }
        module_test.set_expect_requests(request_matcher, pdf_response)

    def check(self, module_test, events):
        """Assert that no URL or finding derived from the PDF body was emitted.

        Raises:
            AssertionError: If any URL_UNVERIFIED event mentions the host from
                the body, or any FINDING description mentions the ftp URL.
        """
        # excavate should skip PDF responses entirely, so no URLs or findings should be extracted from the body
        leaked_urls = []
        for event in events:
            if event.type == "URL_UNVERIFIED" and "pdf-extracted.test.notreal" in event.data:
                leaked_urls.append(event)
        assert not leaked_urls, (
            f"PDF body should not be processed by excavate, but got: {leaked_urls} "
        )

        leaked_findings = []
        for event in events:
            if event.type != "FINDING":
                continue
            # FINDING data is accessed as a mapping; a missing description counts as empty
            if "ftp://ftp.test.notreal" in event.data.get("description", ""):
                leaked_findings.append(event)
        assert not leaked_findings, f"PDF body should not produce findings, but got: {leaked_findings} "
0 commit comments