|
13 | 13 | from typing import BinaryIO |
14 | 14 |
|
15 | 15 | import requests |
| 16 | +from bs4 import BeautifulSoup |
16 | 17 | from requests.models import Response |
17 | 18 |
|
18 | 19 | from macaron.config.defaults import defaults |
@@ -595,3 +596,37 @@ def decode(data: bytes) -> str | None: |
595 | 596 |
|
596 | 597 | logger.debug("Failed to decode bytes using most common character encodings.") |
597 | 598 | return None |
| 599 | + |
| 600 | + |
def html_is_js_challenge(html: str) -> bool:
    """Check if this HTML is the JavaScript Challenge response.

    The JavaScript Challenge is generally returned to a GET request when a CDN serves some
    JavaScript code to be rendered for the page. This usually means the HTML page isn't obtained
    when using request libraries that cannot render JavaScript, and this page is returned instead.

    The page is recognised by three text markers that must all be present:

    * a ``<title>`` containing "Client Challenge",
    * a ``<span>`` containing "JavaScript is disabled in your browser",
    * a ``<p>`` containing "Please enable JavaScript to proceed".

    Every ``<title>``, ``<span>`` and ``<p>`` element in the document is inspected (not only the
    first one of each kind), and the comparison ignores letter case.

    Parameters
    ----------
    html: str
        The string HTML of the page returned by a request.

    Returns
    -------
    bool
        True if the page is a JavaScript Challenge html response. False otherwise.
    """
    # Main three components:
    #   <html><head><title>Client Challenge
    #   <html><body><noscript><div><div><span>JavaScript is disabled in your browser
    #   <html><body><noscript><div><div><p>Please enable JavaScript to proceed

    # An empty document cannot contain any of the markers; skip parsing altogether.
    if not html:
        return False

    soup = BeautifulSoup(html, "html.parser")

    def any_tag_contains(tag_name: str, marker: str) -> bool:
        """Return True if any ``tag_name`` element's text contains ``marker``, ignoring case."""
        folded_marker = marker.casefold()
        # Look at every matching element: checking only the first one would miss the marker
        # whenever an unrelated element of the same kind appears earlier in the page.
        return any(folded_marker in tag.get_text().casefold() for tag in soup.find_all(tag_name))

    return (
        any_tag_contains("title", "Client Challenge")
        and any_tag_contains("span", "JavaScript is disabled in your browser")
        and any_tag_contains("p", "Please enable JavaScript to proceed")
    )
0 commit comments