Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
82d05a7
add parameter emission to wayback
liquidsec Feb 17, 2026
855db97
mods to the wayback parameter extraction
liquidsec Feb 17, 2026
7f8c645
more features / bug fixes for new wayback
liquidsec Feb 18, 2026
560f47c
allow from-wayback tag to propagate
liquidsec Feb 18, 2026
de1c851
update docs for wayback
liquidsec Feb 18, 2026
0020cda
add waf string 4xx filtering
liquidsec Feb 18, 2026
c791b98
add Akamai WAF string to waf_strings helper
liquidsec Feb 18, 2026
5d7363f
add directory listing excavate submodule
liquidsec Feb 18, 2026
202af81
improve wayback CDX error logging and increase timeout
liquidsec Feb 18, 2026
5978644
add rate limiting, retry, and bloom filter dedup to wayback archive f…
liquidsec Feb 19, 2026
c58babe
add CDX server-side filters and 100k URL limit to wayback module
liquidsec Feb 19, 2026
14cbb11
fixing wayback rate limiting
liquidsec Feb 19, 2026
b51aaab
improving wayback delay system
liquidsec Feb 19, 2026
d8b17a8
make cpu heavy processing non-blocking
liquidsec Feb 19, 2026
f506101
make max_records configurable, fix archive retry logic, demote log level
liquidsec Feb 19, 2026
0201f69
Merge branch 'dev' into wayback-upgrade
liquidsec Feb 19, 2026
cdf1000
Merge branch 'wayback-upgrade' into paddingoracle-fix
liquidsec Feb 20, 2026
addd832
Merge pull request #2912 from blacklanternsecurity/paddingoracle-fix
liquidsec Feb 20, 2026
2a77064
fix _event_host() using resolved IP instead of URL hostname
liquidsec Feb 20, 2026
f150610
skip URL collapse when there are no URLs to process
liquidsec Feb 21, 2026
67bbdf1
ruff format
liquidsec Feb 23, 2026
ccb1c94
add timeout and recovery protections to run_in_executor_mp
liquidsec Feb 23, 2026
72fab96
Merge branch 'dev' into wayback-upgrade
liquidsec Feb 25, 2026
231ddf0
Merge branch 'dev' into wayback-upgrade
liquidsec Feb 25, 2026
93df9dd
ruff check fixes
liquidsec Feb 25, 2026
8adc83f
Fix 3.0 merge compatibility: whitelist removal, FINDING schema, test …
liquidsec Feb 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion bbot/core/event/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ def parent(self, parent):
self.web_spider_distance = getattr(parent, "web_spider_distance", 0)
event_has_url = getattr(self, "parsed_url", None) is not None
for t in parent.tags:
if t in ("affiliate",):
if t in ("affiliate", "from-wayback"):
self.add_tag(t)
elif t.startswith("mutation-"):
self.add_tag(t)
Expand Down Expand Up @@ -1050,6 +1050,19 @@ def _data_load(self, data):


class DictHostEvent(DictEvent):
def __init__(self, *args, **kwargs):
    """Initialize the event, then copy the parent's archive_url into our data if we lack one."""
    super().__init__(*args, **kwargs)
    # inherit archive_url from parent for provenance tracking (e.g. wayback archived content)
    if isinstance(self.data, dict) and "archive_url" not in self.data:
        parent = self.parent
        # guard against a missing parent or a self-parented root event
        has_real_parent = parent is not None and parent is not self
        if has_real_parent and isinstance(parent.data, dict) and "archive_url" in parent.data:
            self.data["archive_url"] = parent.data["archive_url"]

def _host(self):
if isinstance(self.data, dict) and "host" in self.data:
return make_ip_type(self.data["host"])
Expand Down Expand Up @@ -1576,6 +1589,7 @@ class _data_validator(BaseModel):
confidence: str
url: Optional[str] = None
path: Optional[str] = None
archive_url: Optional[str] = None
cves: Optional[list[str]] = None
_validate_url = field_validator("url")(validators.validate_url)
_validate_host = field_validator("host")(validators.validate_host)
Expand Down
24 changes: 19 additions & 5 deletions bbot/core/helpers/helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import sys
import asyncio
import logging
from pathlib import Path
import multiprocessing as mp
Expand Down Expand Up @@ -81,7 +83,12 @@ def __init__(self, preset):
# we spawn 1 fewer processes than cores
# this helps to avoid locking up the system or competing with the main python process for cpu time
num_processes = max(1, mp.cpu_count() - 1)
self.process_pool = ProcessPoolExecutor(max_workers=num_processes)
pool_kwargs = {"max_workers": num_processes}
# max_tasks_per_child replaces workers after N tasks, preventing memory leaks
# and reducing the chance of a degraded worker process causing hangs
if sys.version_info >= (3, 11):
pool_kwargs["max_tasks_per_child"] = 25
self.process_pool = ProcessPoolExecutor(**pool_kwargs)

self._cloud = None

Expand Down Expand Up @@ -198,17 +205,24 @@ def run_in_executor(self, callback, *args, **kwargs):
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(None, callback, *args)

def run_in_executor_mp(self, callback, *args, **kwargs):
async def run_in_executor_mp(self, callback, *args, **kwargs):
"""
Same as run_in_executor() except with a process pool executor
Use only in cases where callback is CPU-bound
Same as run_in_executor() except with a process pool executor.
Use only in cases where callback is CPU-bound.

Includes a timeout (default 300s) to prevent indefinite hangs if a
child process dies or the pool enters a broken state.

Pass ``_timeout=seconds`` to override the default timeout.

Examples:
Execute callback:
>>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, arg2)
"""
timeout = kwargs.pop("_timeout", 300)
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(self.process_pool, callback, *args)
future = self.loop.run_in_executor(self.process_pool, callback, *args)
return await asyncio.wait_for(future, timeout=timeout)

@property
def in_tests(self):
Expand Down
19 changes: 19 additions & 0 deletions bbot/core/helpers/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2621,6 +2621,25 @@ async def as_completed(coros):
yield task


def get_waf_strings():
    """
    Returns a list of common WAF (Web Application Firewall) detection strings.

    These are substrings commonly found in WAF block pages; callers match them
    against response bodies to filter out false-positive results.

    Returns:
        list: List of WAF detection strings

    Examples:
        >>> waf_strings = get_waf_strings()
        >>> "The requested URL was rejected" in waf_strings
        True
    """
    # note: trailing space on the Apache "permission" string is intentional
    known_waf_markers = (
        "The requested URL was rejected",
        "This content has been blocked",
        "You don't have permission to access ",
    )
    return list(known_waf_markers)


def clean_dns_record(record):
"""
Cleans and formats a given DNS record for further processing.
Expand Down
5 changes: 5 additions & 0 deletions bbot/core/helpers/web/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,11 @@ async def _acatch(self, url, raise_error):
raise
else:
log.debug(f"HTTP connect failed to URL: {url}")
except httpx.ReadError as e:
if raise_error:
raise
else:
log.verbose(f"HTTP read error for URL: {url}: {e}")
except httpx.HTTPError as e:
if raise_error:
raise
Expand Down
1 change: 1 addition & 0 deletions bbot/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ parameter_blacklist:
- ASP.NET_SessionId
- PHPSESSID
- __cf_bm
- _cfuvid
- f5_cspm

parameter_blacklist_prefixes:
Expand Down
4 changes: 1 addition & 3 deletions bbot/modules/bypass403.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,6 @@
"X-Host": "127.0.0.1",
}

# This is planned to be replaced in the future: https://github.com/blacklanternsecurity/bbot/issues/1068
waf_strings = ["The requested URL was rejected"]

for qp in query_payloads:
signatures.append(("GET", "{scheme}://{netloc}/{path}%s" % qp, None, True))
Expand Down Expand Up @@ -107,7 +105,7 @@ async def do_checks(self, compare_helper, event, collapse_threshold):

# In some cases WAFs will respond with a 200 code which causes a false positive
if subject_response is not None:
for ws in waf_strings:
for ws in self.helpers.get_waf_strings():
if ws in subject_response.text:
self.debug("Rejecting result based on presence of WAF string")
return
Expand Down
7 changes: 7 additions & 0 deletions bbot/modules/httpx.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@ async def handle_batch(self, *events):
self.debug(f'Discarding 404 from "{url}"')
continue

# discard 4xx responses that contain WAF strings
if 400 <= status_code < 500:
body = j.get("body", "")
if any(ws in body for ws in self.helpers.get_waf_strings()):
self.debug(f'Discarding WAF {status_code} from "{url}"')
continue

# main URL
tags = [f"status-{status_code}"]
httpx_ip = j.get("host", "")
Expand Down
91 changes: 79 additions & 12 deletions bbot/modules/internal/excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,42 @@ def in_bl(self, value):

return False

def _is_archived(self, event):
"""Check if an event represents archived wayback content."""
return isinstance(event.data, dict) and "archive_url" in event.data

def _event_host(self, event):
"""Get the effective host from an event.

For archived wayback content, data["host"] contains the original target hostname
(since data["url"] points to archive.org). For regular events, we use event.host.

NOTE: Regular HTTP_RESPONSE events also have data["host"], but it contains the
resolved IP from the httpx binary — NOT a hostname override.
"""
if self._is_archived(event) and event.data.get("host"):
return str(event.data["host"])
return str(event.host)

def _event_base_url(self, event):
"""Get the effective base URL from an event.

For archived wayback content, reconstructs the original URL from override fields
(host/scheme/port/path) since parsed_url points to archive.org.
For regular events, returns event.parsed_url directly.
"""
if not self._is_archived(event):
return event.parsed_url
scheme = event.data.get("scheme", event.parsed_url.scheme)
host = self._event_host(event)
port = event.data.get("port")
if port is not None:
port = int(port)
if not ((scheme == "http" and port == 80) or (scheme == "https" and port == 443)):
host = f"{host}:{port}"
path = event.data.get("path", event.parsed_url.path)
return urlparse(f"{scheme}://{host}{path}")

def url_unparse(self, param_type, parsed_url):
# Reconstructs a URL, optionally omitting the query string based on remove_querystring configuration value.
if param_type == "GETPARAM":
Expand Down Expand Up @@ -638,8 +674,9 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte

# The endpoint is usually a form action - we should use it if we have it. If not, default to URL.
else:
# Use the original URL as the base and resolve the endpoint correctly in case of relative paths
base_url = f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{event.parsed_url.path}"
# Use the effective base URL (which may differ from parsed_url for archived content)
event_base = self.excavate._event_base_url(event)
base_url = f"{event_base.scheme}://{event_base.netloc}{event_base.path}"
if not self.excavate.remove_querystring and len(event.parsed_url.query) > 0:
base_url += f"?{event.parsed_url.query}"
url = urljoin(base_url, endpoint)
Expand Down Expand Up @@ -970,6 +1007,34 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
if yara_results:
event.add_tag("login-page")

class DirectoryListingExtractor(ExcavateRule):
    """Flags HTTP responses that look like server-generated directory listing pages."""

    description = "Detects directory listing pages from web servers."
    # one YARA string per well-known directory-index marker
    signatures = {
        "Apache_Nginx": '"<title>Index of /"',
        "IIS": '"[To Parent Directory]"',
        "Python_HTTP_Server": '"<h1>Directory listing for"',
        "Generic_Directory_Listing": '"<title>Directory Listing"',
    }
    yara_rules = {}

    def __init__(self, excavate):
        super().__init__(excavate)
        # assemble a single YARA rule that fires when any signature matches
        strings_clauses = [rf"${name} = {pattern}" for name, pattern in self.signatures.items()]
        strings_block = " ".join(strings_clauses)
        self.yara_rules["directory_listing"] = (
            f'rule directory_listing {{meta: description = "contains a directory listing" strings: {strings_block} condition: any of them}}'
        )

    async def process(self, yara_results, event, yara_rule_settings, discovery_context):
        # emit one FINDING per match, naming the signature that triggered it
        for identifier, matches in yara_results.items():
            for _match in matches:
                event_data = {
                    "description": f"{discovery_context} {yara_rule_settings.description} ({identifier})"
                }
                await self.report(event_data, event, yara_rule_settings, discovery_context, event_type="FINDING")

def add_yara_rule(self, rule_name, rule_content, rule_instance):
rule_instance.name = rule_name
self.yara_rules_dict[rule_name] = rule_content
Expand Down Expand Up @@ -997,12 +1062,13 @@ async def emit_custom_parameters(self, event, config_key, param_type, descriptio
# Emits WEB_PARAMETER events for custom headers and cookies from the configuration.
custom_params = self.scan.web_config.get(config_key, {})
for param_name, param_value in custom_params.items():
event_base = self._event_base_url(event)
await self.emit_web_parameter(
host=event.parsed_url.hostname,
host=self._event_host(event),
param_type=param_type,
name=param_name,
original_value=param_value,
url=self.url_unparse(param_type, event.parsed_url),
url=self.url_unparse(param_type, event_base),
description=f"HTTP Extracted Parameter [{param_name}] ({description_suffix})",
additional_params=_exclude_key(custom_params, param_name),
event=event,
Expand Down Expand Up @@ -1116,15 +1182,15 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon
if results:
for parameter_name, original_value in results:
await self.emit_web_parameter(
host=str(event.host),
host=self._event_host(event),
param_type="SPECULATIVE",
name=parameter_name,
original_value=original_value,
url=str(event.data["url"]),
url=self._event_base_url(event).geturl(),
description=f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]",
additional_params={},
event=event,
context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}",
context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {self._event_host(event)}",
)
return

Expand Down Expand Up @@ -1176,7 +1242,7 @@ async def handle_event(self, event, **kwargs):
) in extract_params_url(event.parsed_url):
if self.in_bl(parameter_name) is False:
await self.emit_web_parameter(
host=parsed_url.hostname,
host=self._event_host(event),
param_type="GETPARAM",
name=parameter_name,
original_value=original_value,
Expand Down Expand Up @@ -1210,12 +1276,13 @@ async def handle_event(self, event, **kwargs):

if self.in_bl(cookie_name) is False:
self.assigned_cookies[cookie_name] = cookie_value
event_base = self._event_base_url(event)
await self.emit_web_parameter(
host=str(event.host),
host=self._event_host(event),
param_type="COOKIE",
name=cookie_name,
original_value=cookie_value,
url=self.url_unparse("COOKIE", event.parsed_url),
url=self.url_unparse("COOKIE", event_base),
description=f"Set-Cookie Assigned Cookie [{cookie_name}]",
additional_params={},
event=event,
Expand Down Expand Up @@ -1252,10 +1319,10 @@ async def handle_event(self, event, **kwargs):
original_value,
regex_name,
additional_params,
) in extract_params_location(header_value, event.parsed_url):
) in extract_params_location(header_value, self._event_base_url(event)):
if self.in_bl(parameter_name) is False:
await self.emit_web_parameter(
host=parsed_url.hostname,
host=self._event_host(event),
param_type="GETPARAM",
name=parameter_name,
original_value=original_value,
Expand Down
4 changes: 3 additions & 1 deletion bbot/modules/templates/subdomain_enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,10 @@ async def filter_event(self, event):
# reject if it's a cloud resource and not in our target (unless it's a seed event)
if is_cloud and not self.scan.in_target(event) and "seed" not in event.tags:
return False, "Event is a cloud resource and not a direct target"
# don't reject targets — if the user explicitly targeted a domain, always process it
is_target = self.scan.in_target(event)
# optionally reject events with wildcards / errors
if self.reject_wildcards:
if self.reject_wildcards and not is_target:
if any(t in event.tags for t in ("a-error", "aaaa-error")):
return False, "Event has a DNS resolution error"
if self.reject_wildcards == "strict":
Expand Down
Loading
Loading