Merge pull request #2913 from blacklanternsecurity/excavate-ignore-pdf

TheTechromancer · web-flow · commit 052e88e22363 · 2026-02-25T11:01:54.000-05:00
Don't excavate PDFs
diff --git a/bbot/modules/internal/excavate.py b/bbot/modules/internal/excavate.py
@@ -1242,6 +1242,14 @@ async def handle_event(self, event, **kwargs):
                     if header.lower() == "content-type":
                         content_type = headers["content-type"][0]
 
+            # skip PDF responses -- running YARA/regex on raw PDF bytes produces false positives and wastes time.
+            # PDFs are still processed correctly via the filedownload → extractous → RAW_TEXT pipeline,
+            # which extracts readable text and feeds it back to excavate as a RAW_TEXT event (handled separately below).
+            # TODO: remove this in favor of a proper categorization system for text vs non-text (i.e. to-be-extracted) content
+            if content_type and "application/pdf" in content_type.lower():
+                self.debug(f"Skipping PDF response: {event.data.get('url', 'unknown')}")
+                return
+
             await self.search(
                 body,
                 event,
diff --git a/bbot/test/test_step_2/module_tests/test_module_excavate.py b/bbot/test/test_step_2/module_tests/test_module_excavate.py
@@ -1491,3 +1491,31 @@ async def setup_before_prep(self, module_test):
     def check(self, module_test, events):
         # Verify we got the hostname
         assert any(e.data == "asdffoo.test.notreal" for e in events)
+
+
+class TestExcavateIgnorePDF(ModuleTestBase):
+    targets = ["http://127.0.0.1:8888/"]
+    modules_overrides = ["excavate", "httpx"]
+
+    # served as application/pdf; carries an HTTPS URL and an FTP URL that check() asserts are never extracted
+    pdf_body_with_urls = "https://pdf-extracted.test.notreal/some/path ftp://ftp.test.notreal"
+
+    async def setup_after_prep(self, module_test):
+        module_test.set_expect_requests(
+            {"uri": "/"},
+            {"response_data": self.pdf_body_with_urls, "headers": {"Content-Type": "application/pdf"}},
+        )
+
+    def check(self, module_test, events):
+        # the response is application/pdf, so excavate must skip it: no URL_UNVERIFIED or FINDING may come from its body
+        url_unverified_events = [
+            e for e in events if e.type == "URL_UNVERIFIED" and "pdf-extracted.test.notreal" in e.data
+        ]
+        assert len(url_unverified_events) == 0, (
+            f"PDF body should not be processed by excavate, but got: {url_unverified_events}"
+        )
+
+        ftp_findings = [
+            e for e in events if e.type == "FINDING" and "ftp://ftp.test.notreal" in e.data.get("description", "")
+        ]
+        assert len(ftp_findings) == 0, f"PDF body should not produce findings, but got: {ftp_findings}"