Skip to content

Commit 736dbf8

Browse files
authored
chore: record when a JS challenge is returned (#1198)
Log a debug message when a request for a PyPI page returns a JavaScript Challenge response. Signed-off-by: Carl Flottmann <[email protected]>
1 parent 5f998e0 commit 736dbf8

File tree

2 files changed

+42
-0
lines changed

2 files changed

+42
-0
lines changed

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from macaron.util import (
3030
can_download_file,
3131
download_file_with_size_limit,
32+
html_is_js_challenge,
3233
send_get_http_raw,
3334
stream_file_with_size_limit,
3435
)
@@ -321,6 +322,9 @@ def get_package_page(self, package_name: str) -> str | None:
321322
response = send_get_http_raw(url)
322323
if response:
323324
html_snippets = response.content.decode("utf-8")
325+
if html_is_js_challenge(html_snippets):
326+
logger.debug("URL returned a JavaScript Challenge: %s", url)
327+
return None
324328
return html_snippets
325329
return None
326330

@@ -362,6 +366,9 @@ def get_maintainer_profile_page(self, username: str) -> str | None:
362366
response = send_get_http_raw(url, headers=None)
363367
if response:
364368
html_snippets = response.content.decode("utf-8")
369+
if html_is_js_challenge(html_snippets):
370+
logger.debug("URL returned a JavaScript Challenge: %s", url)
371+
return None
365372
return html_snippets
366373
return None
367374

src/macaron/util.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from typing import BinaryIO
1414

1515
import requests
16+
from bs4 import BeautifulSoup
1617
from requests.models import Response
1718

1819
from macaron.config.defaults import defaults
@@ -595,3 +596,37 @@ def decode(data: bytes) -> str | None:
595596

596597
logger.debug("Failed to decode bytes using most common character encodings.")
597598
return None
599+
600+
601+
def html_is_js_challenge(html: str) -> bool:
    """Check if this HTML is the JavaScript Challenge response.

    The JavaScript Challenge is generally returned to a GET request when a CDN serves some
    JavaScript code to be rendered for the page. This usually means the HTML page isn't obtained
    when using request libraries that cannot render JavaScript, and this page is returned instead.

    The page is recognised by three markers that must all be present:

    - a ``<title>`` containing "Client Challenge";
    - a ``<span>`` containing "JavaScript is disabled in your browser";
    - a ``<p>`` containing "Please enable JavaScript to proceed".

    Every element of each kind is inspected, so the markers are still found when other
    ``<span>`` or ``<p>`` elements appear earlier in the document.

    Parameters
    ----------
    html: str
        The string HTML of the page returned by a request.

    Returns
    -------
    bool
        True if the page is a JavaScript Challenge html response. False otherwise.
    """
    # Expected layout of the challenge page:
    #   <html><head><title>Client Challenge
    #   <html><body><noscript><div><div><span>JavaScript is disabled in your browser
    #   <html><body><noscript><div><div><p>Please enable JavaScript to proceed
    soup = BeautifulSoup(html, "html.parser")

    def any_tag_contains(tag_name: str, marker: str) -> bool:
        """Return True if the text of any ``tag_name`` element contains ``marker``."""
        return any(marker in tag.get_text() for tag in soup.find_all(tag_name))

    # All three markers are required to keep false positives on ordinary pages unlikely.
    return (
        any_tag_contains("title", "Client Challenge")
        and any_tag_contains("span", "JavaScript is disabled in your browser")
        and any_tag_contains("p", "Please enable JavaScript to proceed")
    )

0 commit comments

Comments
 (0)