# NOTE: removed web-scrape residue (GitHub page chrome and line-number gutter)
import re
import orjson
import tempfile
import subprocess
from pathlib import Path
from http.cookies import SimpleCookie
from bbot.modules.base import BaseModule
class httpx(BaseModule):
    """Probe web targets with ProjectDiscovery's httpx binary.

    Consumes open ports and (un)verified URLs, feeds them to the httpx CLI in
    large batches, and for every live web resource emits a URL event plus a
    full HTTP_RESPONSE event. Many other bbot modules rely on the
    HTTP_RESPONSE events produced here.
    """

    watched_events = ["OPEN_TCP_PORT", "URL_UNVERIFIED", "URL"]
    produced_events = ["URL", "HTTP_RESPONSE"]
    flags = ["active", "safe", "web-basic", "social-enum", "subdomain-enum", "cloud-enum"]
    meta = {
        "description": "Visit webpages. Many other modules rely on httpx",
        "created_date": "2022-07-08",
        "author": "@TheTechromancer",
    }
    options = {
        "threads": 50,
        "in_scope_only": True,
        "version": "1.2.5",
        "max_response_size": 5242880,
        "store_responses": False,
        "probe_all_ips": False,
    }
    options_desc = {
        "threads": "Number of httpx threads to use",
        # fixed garbled wording: "web reparents" -> "web resources"
        "in_scope_only": "Only visit web resources that are in scope.",
        "version": "httpx version",
        "max_response_size": "Max response size in bytes",
        "store_responses": "Save raw HTTP responses to scan folder",
        "probe_all_ips": "Probe all the ips associated with same host",
    }
    deps_ansible = [
        {
            "name": "Download httpx",
            "unarchive": {
                "src": "https://github.com/projectdiscovery/httpx/releases/download/v#{BBOT_MODULES_HTTPX_VERSION}/httpx_#{BBOT_MODULES_HTTPX_VERSION}_#{BBOT_OS}_#{BBOT_CPU_ARCH_GOLANG}.zip",
                "include": "httpx",
                "dest": "#{BBOT_TOOLS}",
                "remote_src": True,
            },
        }
    ]

    # accept events up to 2 hops outside the main scope (filter_event narrows further)
    scope_distance_modifier = 2
    _shuffle_incoming_queue = False
    # events are fed to the httpx CLI in batches of this size
    _batch_size = 500
    _priority = 2
    # accept Javascript URLs
    accept_url_special = True

    async def setup(self):
        """Cache config values and precompile the temp-dir cleanup regex."""
        self.threads = self.config.get("threads", 50)
        self.max_response_size = self.config.get("max_response_size", 5242880)
        self.store_responses = self.config.get("store_responses", False)
        self.probe_all_ips = self.config.get("probe_all_ips", False)
        # matches leftover "httpx<digits>" temp dirs created by the httpx binary
        self.httpx_tempdir_regex = re.compile(r"^httpx\d+$")
        return True

    async def filter_event(self, event):
        """Decide whether an incoming event should be probed.

        Returns True to accept, or a (False, reason) tuple to reject.
        """
        if "_wildcard" in str(event.host).split("."):
            return False, "event is wildcard"
        if "unresolved" in event.tags:
            return False, "event is unresolved"
        # never re-visit our own output
        if event.module == self:
            return False, "event is from self"
        if "spider-max" in event.tags:
            return False, "event exceeds spidering limits"
        # scope filtering
        in_scope_only = self.config.get("in_scope_only", True)
        # "httpx-safe" tag bypasses the scope check entirely
        if "httpx-safe" in event.tags:
            return True
        max_scope_distance = 0 if in_scope_only else (self.scan.scope_search_distance + 1)
        if event.scope_distance > max_scope_distance:
            return False, "event is not in scope"
        return True

    def make_url_metadata(self, event):
        """Build the probe URL and a dedup hash for *event*.

        URL events with a bare "/" path — and all non-URL events — hash by
        (host, port), so multiple events pointing at the same web server
        collapse into one probe; URLs with a deeper path hash individually.
        """
        has_spider_max = "spider-max" in event.tags
        url_hash = None
        if event.type.startswith("URL"):
            # we NEED the port, otherwise httpx will try HTTPS even for HTTP URLs
            url = event.with_port().geturl()
            if event.parsed_url.path == "/":
                url_hash = hash((event.host, event.port, has_spider_max))
        else:
            url = str(event.data)
            url_hash = hash((event.host, event.port, has_spider_max))
        if url_hash is None:
            url_hash = hash((url, has_spider_max))
        return url, url_hash

    def _incoming_dedup_hash(self, event):
        """Dedupe incoming events using the hash from make_url_metadata()."""
        url, url_hash = self.make_url_metadata(event)
        return url_hash

    async def handle_batch(self, *events):
        """Run the httpx CLI over a batch of events and emit the results."""
        # map probe URL -> originating event so JSON results can be correlated
        stdin = {}
        for event in events:
            url, url_hash = self.make_url_metadata(event)
            stdin[url] = event
        if not stdin:
            return

        command = [
            "httpx",
            "-silent",
            "-json",
            "-include-response",
            "-threads",
            self.threads,
            "-timeout",
            self.scan.httpx_timeout,
            "-retries",
            self.scan.httpx_retries,
            "-header",
            f"User-Agent: {self.scan.useragent}",
            "-response-size-to-read",
            f"{self.max_response_size}",
        ]
        if self.store_responses:
            response_dir = self.scan.home / "httpx"
            self.helpers.mkdir(response_dir)
            command += ["-srd", str(response_dir)]
        dns_resolvers = ",".join(self.helpers.system_resolvers)
        if dns_resolvers:
            command += ["-r", dns_resolvers]
        if self.probe_all_ips:
            command += ["-probe-all-ips"]
        # Add custom HTTP headers
        for hk, hv in self.scan.custom_http_headers.items():
            command += ["-header", f"{hk}: {hv}"]
        # Add custom HTTP cookies as a single header
        if self.scan.custom_http_cookies:
            cookie = SimpleCookie()
            for ck, cv in self.scan.custom_http_cookies.items():
                cookie[ck] = cv
            # Build the cookie header
            cookie_header = f"Cookie: {cookie.output(header='', sep='; ').strip()}"
            command += ["-header", cookie_header]
        proxy = self.scan.http_proxy
        if proxy:
            command += ["-http-proxy", proxy]

        async for line in self.run_process_live(command, text=False, input=list(stdin), stderr=subprocess.DEVNULL):
            try:
                # decode in an executor so large JSON lines don't block the event loop
                j = await self.helpers.run_in_executor(orjson.loads, line)
            except orjson.JSONDecodeError:
                self.warning(f"httpx failed to decode line: {line}")
                continue

            url = j.get("url", "")
            status_code = int(j.get("status_code", 0))
            if status_code == 0:
                self.debug(f'No HTTP status code for "{url}"')
                continue
            # httpx echoes our input URL back in the "input" field
            parent_event = stdin.get(j.get("input", ""), None)
            if parent_event is None:
                self.warning(f"Unable to correlate parent event from: {line}")
                continue

            # discard 404s from unverified URLs
            path = j.get("path", "/")
            if parent_event.type == "URL_UNVERIFIED" and status_code in (404,) and path != "/":
                self.debug(f'Discarding 404 from "{url}"')
                continue

            # discard 4xx responses that contain WAF strings
            if 400 <= status_code < 500:
                body = j.get("body", "")
                if any(ws in body for ws in self.helpers.get_waf_strings()):
                    self.debug(f'Discarding WAF {status_code} from "{url}"')
                    continue

            # main URL
            tags = [f"status-{status_code}"]
            httpx_ip = j.get("host", "")
            if httpx_ip:
                tags.append(f"ip-{httpx_ip}")
            # grab title
            title = self.helpers.tagify(j.get("title", ""), maxlen=30)
            if title:
                tags.append(f"http-title-{title}")

            # literal braces: this is a template filled in later, not an f-string
            url_context = "{module} visited {event.parent.data} and got status code {event.http_status}"
            if parent_event.type == "OPEN_TCP_PORT":
                url_context += " at {event.data}"
            url_event = self.make_event(
                url,
                "URL",
                parent_event,
                tags=tags,
                context=url_context,
            )
            if url_event:
                if url_event != parent_event:
                    await self.emit_event(url_event)
                # HTTP response
                content_type = j.get("header", {}).get("content_type", "unspecified").split(";")[0]
                content_length = j.get("content_length", 0)
                content_length = self.helpers.bytes_to_human(content_length)
                await self.emit_event(
                    j,
                    "HTTP_RESPONSE",
                    url_event,
                    tags=url_event.tags,
                    context=f"HTTP_RESPONSE was {content_length} with {content_type} content type",
                )

        # remove leftover "httpx<digits>" temp dirs created by the binary
        for tempdir in Path(tempfile.gettempdir()).iterdir():
            if tempdir.is_dir() and self.httpx_tempdir_regex.match(tempdir.name):
                self.helpers.rm_rf(tempdir)

    async def cleanup(self):
        """Delete httpx's resume.cfg if one was left behind."""
        resume_file = self.helpers.current_dir / "resume.cfg"
        resume_file.unlink(missing_ok=True)