Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
82d05a7
add parameter emission to wayback
liquidsec Feb 17, 2026
855db97
mods to the wayback parameter extraction
liquidsec Feb 17, 2026
7f8c645
more features / bug fixes for new wayback
liquidsec Feb 18, 2026
560f47c
allow from-wayback tag to propagate
liquidsec Feb 18, 2026
de1c851
update docs for wayback
liquidsec Feb 18, 2026
0020cda
add waf string 4xx filtering
liquidsec Feb 18, 2026
c791b98
add Akamai WAF string to waf_strings helper
liquidsec Feb 18, 2026
5d7363f
add directory listing excavate submodule
liquidsec Feb 18, 2026
202af81
improve wayback CDX error logging and increase timeout
liquidsec Feb 18, 2026
5978644
add rate limiting, retry, and bloom filter dedup to wayback archive f…
liquidsec Feb 19, 2026
c58babe
add CDX server-side filters and 100k URL limit to wayback module
liquidsec Feb 19, 2026
14cbb11
fixing wayback rate limiting
liquidsec Feb 19, 2026
b51aaab
improving wayback delay system
liquidsec Feb 19, 2026
d8b17a8
make cpu heavy processing non-blocking
liquidsec Feb 19, 2026
f506101
make max_records configurable, fix archive retry logic, demote log level
liquidsec Feb 19, 2026
0201f69
Merge branch 'dev' into wayback-upgrade
liquidsec Feb 19, 2026
cdf1000
Merge branch 'wayback-upgrade' into paddingoracle-fix
liquidsec Feb 20, 2026
addd832
Merge pull request #2912 from blacklanternsecurity/paddingoracle-fix
liquidsec Feb 20, 2026
2a77064
fix _event_host() using resolved IP instead of URL hostname
liquidsec Feb 20, 2026
f150610
skip URL collapse when there are no URLs to process
liquidsec Feb 21, 2026
67bbdf1
ruff format
liquidsec Feb 23, 2026
ccb1c94
add timeout and recovery protections to run_in_executor_mp
liquidsec Feb 23, 2026
72fab96
Merge branch 'dev' into wayback-upgrade
liquidsec Feb 25, 2026
231ddf0
Merge branch 'dev' into wayback-upgrade
liquidsec Feb 25, 2026
93df9dd
ruff check fixes
liquidsec Feb 25, 2026
8adc83f
Fix 3.0 merge compatibility: whitelist removal, FINDING schema, test …
liquidsec Feb 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion bbot/core/event/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,7 @@ def parent(self, parent):
self.web_spider_distance = getattr(parent, "web_spider_distance", 0)
event_has_url = getattr(self, "parsed_url", None) is not None
for t in parent.tags:
if t in ("affiliate",):
if t in ("affiliate", "from-wayback"):
self.add_tag(t)
elif t.startswith("mutation-"):
self.add_tag(t)
Expand Down Expand Up @@ -1050,6 +1050,19 @@ def _data_load(self, data):


class DictHostEvent(DictEvent):
def __init__(self, *args, **kwargs):
    """Initialize the event, then copy the parent's archive_url into our data if we lack one."""
    super().__init__(*args, **kwargs)
    # inherit archive_url from parent for provenance tracking (e.g. wayback archived content)
    if isinstance(self.data, dict) and "archive_url" not in self.data:
        parent = self.parent
        # guard against a missing parent or a self-parented root event
        has_real_parent = parent is not None and parent is not self
        if has_real_parent and isinstance(parent.data, dict) and "archive_url" in parent.data:
            self.data["archive_url"] = parent.data["archive_url"]

def _host(self):
if isinstance(self.data, dict) and "host" in self.data:
return make_ip_type(self.data["host"])
Expand Down Expand Up @@ -1576,6 +1589,7 @@ class _data_validator(BaseModel):
confidence: str
url: Optional[str] = None
path: Optional[str] = None
archive_url: Optional[str] = None
cves: Optional[list[str]] = None
_validate_url = field_validator("url")(validators.validate_url)
_validate_host = field_validator("host")(validators.validate_host)
Expand Down
24 changes: 19 additions & 5 deletions bbot/core/helpers/helper.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import os
import sys
import asyncio
import logging
from pathlib import Path
import multiprocessing as mp
Expand Down Expand Up @@ -81,7 +83,12 @@ def __init__(self, preset):
# we spawn 1 fewer processes than cores
# this helps to avoid locking up the system or competing with the main python process for cpu time
num_processes = max(1, mp.cpu_count() - 1)
self.process_pool = ProcessPoolExecutor(max_workers=num_processes)
pool_kwargs = {"max_workers": num_processes}
# max_tasks_per_child replaces workers after N tasks, preventing memory leaks
# and reducing the chance of a degraded worker process causing hangs
if sys.version_info >= (3, 11):
pool_kwargs["max_tasks_per_child"] = 25
self.process_pool = ProcessPoolExecutor(**pool_kwargs)

self._cloud = None

Expand Down Expand Up @@ -198,17 +205,24 @@ def run_in_executor(self, callback, *args, **kwargs):
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(None, callback, *args)

def run_in_executor_mp(self, callback, *args, **kwargs):
async def run_in_executor_mp(self, callback, *args, **kwargs):
"""
Same as run_in_executor() except with a process pool executor
Use only in cases where callback is CPU-bound
Same as run_in_executor() except with a process pool executor.
Use only in cases where callback is CPU-bound.

Includes a timeout (default 300s) to prevent indefinite hangs if a
child process dies or the pool enters a broken state.

Pass ``_timeout=seconds`` to override the default timeout.

Examples:
Execute callback:
>>> result = await self.helpers.run_in_executor_mp(callback_fn, arg1, arg2)
"""
timeout = kwargs.pop("_timeout", 300)
callback = partial(callback, **kwargs)
return self.loop.run_in_executor(self.process_pool, callback, *args)
future = self.loop.run_in_executor(self.process_pool, callback, *args)
return await asyncio.wait_for(future, timeout=timeout)

@property
def in_tests(self):
Expand Down
19 changes: 19 additions & 0 deletions bbot/core/helpers/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2621,6 +2621,25 @@ async def as_completed(coros):
yield task


def get_waf_strings():
    """
    Returns a list of common WAF (Web Application Firewall) detection strings.

    These are substrings commonly found in WAF block pages; callers match them
    against response bodies to filter out false-positive results.

    Returns:
        list: List of WAF detection strings

    Examples:
        >>> waf_strings = get_waf_strings()
        >>> "The requested URL was rejected" in waf_strings
        True
    """
    # note: trailing space on the Apache "permission" string is intentional
    known_waf_markers = (
        "The requested URL was rejected",
        "This content has been blocked",
        "You don't have permission to access ",
    )
    return list(known_waf_markers)


def clean_dns_record(record):
"""
Cleans and formats a given DNS record for further processing.
Expand Down
5 changes: 5 additions & 0 deletions bbot/core/helpers/web/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,11 @@ async def _acatch(self, url, raise_error):
raise
else:
log.debug(f"HTTP connect failed to URL: {url}")
except httpx.ReadError as e:
if raise_error:
raise
else:
log.verbose(f"HTTP read error for URL: {url}: {e}")
except httpx.HTTPError as e:
if raise_error:
raise
Expand Down
1 change: 1 addition & 0 deletions bbot/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ parameter_blacklist:
- ASP.NET_SessionId
- PHPSESSID
- __cf_bm
- _cfuvid
- f5_cspm

parameter_blacklist_prefixes:
Expand Down
4 changes: 1 addition & 3 deletions bbot/modules/bypass403.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,6 @@
"X-Host": "127.0.0.1",
}

# This is planned to be replaced in the future: https://github.com/blacklanternsecurity/bbot/issues/1068
waf_strings = ["The requested URL was rejected"]

for qp in query_payloads:
signatures.append(("GET", "{scheme}://{netloc}/{path}%s" % qp, None, True))
Expand Down Expand Up @@ -107,7 +105,7 @@ async def do_checks(self, compare_helper, event, collapse_threshold):

# In some cases WAFs will respond with a 200 code which causes a false positive
if subject_response is not None:
for ws in waf_strings:
for ws in self.helpers.get_waf_strings():
if ws in subject_response.text:
self.debug("Rejecting result based on presence of WAF string")
return
Expand Down
7 changes: 7 additions & 0 deletions bbot/modules/httpx.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,13 @@ async def handle_batch(self, *events):
self.debug(f'Discarding 404 from "{url}"')
continue

# discard 4xx responses that contain WAF strings
if 400 <= status_code < 500:
body = j.get("body", "")
if any(ws in body for ws in self.helpers.get_waf_strings()):
self.debug(f'Discarding WAF {status_code} from "{url}"')
continue

# main URL
tags = [f"status-{status_code}"]
httpx_ip = j.get("host", "")
Expand Down
91 changes: 79 additions & 12 deletions bbot/modules/internal/excavate.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,42 @@ def in_bl(self, value):

return False

def _is_archived(self, event):
"""Check if an event represents archived wayback content."""
return isinstance(event.data, dict) and "archive_url" in event.data

def _event_host(self, event):
"""Get the effective host from an event.

For archived wayback content, data["host"] contains the original target hostname
(since data["url"] points to archive.org). For regular events, we use event.host.

NOTE: Regular HTTP_RESPONSE events also have data["host"], but it contains the
resolved IP from the httpx binary — NOT a hostname override.
"""
if self._is_archived(event) and event.data.get("host"):
return str(event.data["host"])
return str(event.host)

def _event_base_url(self, event):
"""Get the effective base URL from an event.

For archived wayback content, reconstructs the original URL from override fields
(host/scheme/port/path) since parsed_url points to archive.org.
For regular events, returns event.parsed_url directly.
"""
if not self._is_archived(event):
return event.parsed_url
scheme = event.data.get("scheme", event.parsed_url.scheme)
host = self._event_host(event)
port = event.data.get("port")
if port is not None:
port = int(port)
if not ((scheme == "http" and port == 80) or (scheme == "https" and port == 443)):
host = f"{host}:{port}"
path = event.data.get("path", event.parsed_url.path)
return urlparse(f"{scheme}://{host}{path}")

def url_unparse(self, param_type, parsed_url):
# Reconstructs a URL, optionally omitting the query string based on remove_querystring configuration value.
if param_type == "GETPARAM":
Expand Down Expand Up @@ -638,8 +674,9 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte

# The endpoint is usually a form action - we should use it if we have it. If not, default to URL.
else:
# Use the original URL as the base and resolve the endpoint correctly in case of relative paths
base_url = f"{event.parsed_url.scheme}://{event.parsed_url.netloc}{event.parsed_url.path}"
# Use the effective base URL (which may differ from parsed_url for archived content)
event_base = self.excavate._event_base_url(event)
base_url = f"{event_base.scheme}://{event_base.netloc}{event_base.path}"
if not self.excavate.remove_querystring and len(event.parsed_url.query) > 0:
base_url += f"?{event.parsed_url.query}"
url = urljoin(base_url, endpoint)
Expand Down Expand Up @@ -970,6 +1007,34 @@ async def process(self, yara_results, event, yara_rule_settings, discovery_conte
if yara_results:
event.add_tag("login-page")

class DirectoryListingExtractor(ExcavateRule):
    """Flags HTTP responses that look like server-generated directory listing pages."""

    description = "Detects directory listing pages from web servers."
    # one YARA string per well-known directory-index marker
    signatures = {
        "Apache_Nginx": '"<title>Index of /"',
        "IIS": '"[To Parent Directory]"',
        "Python_HTTP_Server": '"<h1>Directory listing for"',
        "Generic_Directory_Listing": '"<title>Directory Listing"',
    }
    yara_rules = {}

    def __init__(self, excavate):
        super().__init__(excavate)
        # assemble a single YARA rule that fires when any signature matches
        strings_clauses = [rf"${name} = {pattern}" for name, pattern in self.signatures.items()]
        strings_block = " ".join(strings_clauses)
        self.yara_rules["directory_listing"] = (
            f'rule directory_listing {{meta: description = "contains a directory listing" strings: {strings_block} condition: any of them}}'
        )

    async def process(self, yara_results, event, yara_rule_settings, discovery_context):
        # emit one FINDING per match, naming the signature that triggered it
        for identifier, matches in yara_results.items():
            for _match in matches:
                event_data = {
                    "description": f"{discovery_context} {yara_rule_settings.description} ({identifier})"
                }
                await self.report(event_data, event, yara_rule_settings, discovery_context, event_type="FINDING")

def add_yara_rule(self, rule_name, rule_content, rule_instance):
rule_instance.name = rule_name
self.yara_rules_dict[rule_name] = rule_content
Expand Down Expand Up @@ -997,12 +1062,13 @@ async def emit_custom_parameters(self, event, config_key, param_type, descriptio
# Emits WEB_PARAMETER events for custom headers and cookies from the configuration.
custom_params = self.scan.web_config.get(config_key, {})
for param_name, param_value in custom_params.items():
event_base = self._event_base_url(event)
await self.emit_web_parameter(
host=event.parsed_url.hostname,
host=self._event_host(event),
param_type=param_type,
name=param_name,
original_value=param_value,
url=self.url_unparse(param_type, event.parsed_url),
url=self.url_unparse(param_type, event_base),
description=f"HTTP Extracted Parameter [{param_name}] ({description_suffix})",
additional_params=_exclude_key(custom_params, param_name),
event=event,
Expand Down Expand Up @@ -1116,15 +1182,15 @@ async def search(self, data, event, content_type, discovery_context="HTTP respon
if results:
for parameter_name, original_value in results:
await self.emit_web_parameter(
host=str(event.host),
host=self._event_host(event),
param_type="SPECULATIVE",
name=parameter_name,
original_value=original_value,
url=str(event.data["url"]),
url=self._event_base_url(event).geturl(),
description=f"HTTP Extracted Parameter (speculative from {source_type} content) [{parameter_name}]",
additional_params={},
event=event,
context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {str(event.host)}",
context=f"excavate's Parameter extractor found a speculative WEB_PARAMETER: {parameter_name} by parsing {source_type} data from {self._event_host(event)}",
)
return

Expand Down Expand Up @@ -1176,7 +1242,7 @@ async def handle_event(self, event, **kwargs):
) in extract_params_url(event.parsed_url):
if self.in_bl(parameter_name) is False:
await self.emit_web_parameter(
host=parsed_url.hostname,
host=self._event_host(event),
param_type="GETPARAM",
name=parameter_name,
original_value=original_value,
Expand Down Expand Up @@ -1210,12 +1276,13 @@ async def handle_event(self, event, **kwargs):

if self.in_bl(cookie_name) is False:
self.assigned_cookies[cookie_name] = cookie_value
event_base = self._event_base_url(event)
await self.emit_web_parameter(
host=str(event.host),
host=self._event_host(event),
param_type="COOKIE",
name=cookie_name,
original_value=cookie_value,
url=self.url_unparse("COOKIE", event.parsed_url),
url=self.url_unparse("COOKIE", event_base),
description=f"Set-Cookie Assigned Cookie [{cookie_name}]",
additional_params={},
event=event,
Expand Down Expand Up @@ -1252,10 +1319,10 @@ async def handle_event(self, event, **kwargs):
original_value,
regex_name,
additional_params,
) in extract_params_location(header_value, event.parsed_url):
) in extract_params_location(header_value, self._event_base_url(event)):
if self.in_bl(parameter_name) is False:
await self.emit_web_parameter(
host=parsed_url.hostname,
host=self._event_host(event),
param_type="GETPARAM",
name=parameter_name,
original_value=original_value,
Expand Down
4 changes: 3 additions & 1 deletion bbot/modules/templates/subdomain_enum.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,10 @@ async def filter_event(self, event):
# reject if it's a cloud resource and not in our target (unless it's a seed event)
if is_cloud and not self.scan.in_target(event) and "seed" not in event.tags:
return False, "Event is a cloud resource and not a direct target"
# don't reject targets — if the user explicitly targeted a domain, always process it
is_target = self.scan.in_target(event)
# optionally reject events with wildcards / errors
if self.reject_wildcards:
if self.reject_wildcards and not is_target:
if any(t in event.tags for t in ("a-error", "aaaa-error")):
return False, "Event has a DNS resolution error"
if self.reject_wildcards == "strict":
Expand Down
Loading
Loading