Skip to content

Commit 052e88e

Browse files
Merge pull request #2913 from blacklanternsecurity/excavate-ignore-pdf
Don't excavate PDFs
2 parents 7bb10ba + 7d69afd commit 052e88e

File tree

2 files changed

+36
-0
lines changed

2 files changed

+36
-0
lines changed

bbot/modules/internal/excavate.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1242,6 +1242,14 @@ async def handle_event(self, event, **kwargs):
12421242
if header.lower() == "content-type":
12431243
content_type = headers["content-type"][0]
12441244

1245+
# skip PDF responses -- running YARA/regex on raw PDF bytes produces false positives and wastes time.
1246+
# PDFs are still processed correctly via the filedownload → extractous → RAW_TEXT pipeline,
1247+
# which extracts readable text and feeds it back to excavate as a RAW_TEXT event (handled separately below).
1248+
# TODO: remove this in favor of a proper categorization system for text vs non-text (i.e. to-be-extracted) content
1249+
if content_type and "application/pdf" in content_type.lower():
1250+
self.debug(f"Skipping PDF response: {event.data.get('url', 'unknown')}")
1251+
return
1252+
12451253
await self.search(
12461254
body,
12471255
event,

bbot/test/test_step_2/module_tests/test_module_excavate.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1491,3 +1491,31 @@ async def setup_before_prep(self, module_test):
14911491
def check(self, module_test, events):
14921492
# Verify we got the hostname
14931493
assert any(e.data == "asdffoo.test.notreal" for e in events)
1494+
1495+
1496+
class TestExcavateIgnorePDF(ModuleTestBase):
    """Check that excavate leaves a response alone when it is served as a PDF.

    The target answers ``/`` with a body containing an HTTPS URL and an FTP
    URL, but labels it ``Content-Type: application/pdf``. Neither a
    URL_UNVERIFIED event for the HTTPS host nor a FINDING mentioning the FTP
    URL may show up in the scan's events.
    """

    targets = ["http://127.0.0.1:8888/"]
    modules_overrides = ["excavate", "httpx"]

    # Bait body: excavate would normally report on these if it scanned the response.
    pdf_body_with_urls = "https://pdf-extracted.test.notreal/some/path ftp://ftp.test.notreal"

    async def setup_after_prep(self, module_test):
        """Register the expected request for ``/`` and its PDF-labelled response."""
        expected_request = {"uri": "/"}
        canned_response = {
            "response_data": self.pdf_body_with_urls,
            "headers": {"Content-Type": "application/pdf"},
        }
        module_test.set_expect_requests(expected_request, canned_response)

    def check(self, module_test, events):
        """Assert that nothing was extracted from the PDF-labelled body."""

        def is_leaked_url(evt):
            # A URL_UNVERIFIED pointing at the bait host means the body was scanned.
            return evt.type == "URL_UNVERIFIED" and "pdf-extracted.test.notreal" in evt.data

        def is_leaked_ftp_finding(evt):
            # A FINDING whose description quotes the bait FTP URL means the same.
            return evt.type == "FINDING" and "ftp://ftp.test.notreal" in evt.data.get("description", "")

        leaked_urls = list(filter(is_leaked_url, events))
        assert not leaked_urls, f"PDF body should not be processed by excavate, but got: {leaked_urls}"

        leaked_findings = list(filter(is_leaked_ftp_finding, events))
        assert not leaked_findings, f"PDF body should not produce findings, but got: {leaked_findings}"

0 commit comments

Comments
 (0)