Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ Features:

Changes:
- Abstract validation functions out of `AbstractSite.download_content` into reusable utils #1882
- Abstract urllib `download_content` into `AbstractSite` for scrapers with `use_urllib = True` #1714

Fixes:
- Fix `masssuperct` by switching from JSON API to HTML scraping with urllib to bypass Cloudflare TLS fingerprinting #1714
- Fix `lactapp_3` opinion download by using urllib instead of httpx to bypass Cloudflare TLS fingerprinting #1882
- Fix `mich` scraper failing when API returns null courts #1885

Expand Down
43 changes: 32 additions & 11 deletions juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,22 @@ async def _download(self, request_dict=None):
self._post_process_response()
return self._return_response_text_object()

def _download_content_urllib(self, download_url: str, headers: dict):
"""Download content using urllib to bypass Cloudflare

Uses urllib instead of httpx because Cloudflare blocks httpx
via TLS fingerprinting. Used by scrapers with `use_urllib = True`.

:param download_url: The URL for the item you wish to download.
:param headers: headers dict
:return: A response object with a `content` field
"""
req = urllib.request.Request(download_url, headers=headers)
response = self.urllib_opener.open(req, timeout=90)
response.content = response.read()

return response

async def download_content(
self,
download_url: str,
Expand Down Expand Up @@ -437,26 +453,28 @@ def handler(request: httpx.Request):
r = await s.get(url=self.url)
return self.cleanup_content(r.content)

s = self.request["session"]

if self.needs_special_headers:
headers = self.request["headers"]
else:
headers = {"User-Agent": "CourtListener"}

# Note that we do a GET even if self.method is POST. This is
# deliberate.
r = await s.get(
download_url,
headers=headers,
cookies=self.cookies,
timeout=300,
)
if self.use_urllib:
r = self._download_content_urllib(download_url, headers)
else:
s = self.request["session"]
# Note that we do a GET even if self.method is POST. This is
# deliberate.
r = await s.get(
download_url,
headers=headers,
cookies=self.cookies,
timeout=300,
)

check_empty_downloaded_file(r, download_url)
check_expected_content_types(self, r, download_url)

if doctor_is_available:
if doctor_is_available and not self.use_urllib:
# test for and follow meta redirects, uses doctor get_extension
# service
r = await follow_redirections(r, s)
Expand Down Expand Up @@ -489,15 +507,18 @@ def _download_urllib(self):
data = None
if self.method == "POST":
data = urllib.parse.urlencode(self.parameters).encode("utf-8")

raw = self._urllib_fetch(self.url, data=data)
text = raw.decode("utf-8")

content_type = ""
if hasattr(self.request["response"], "getheader"):
content_type = self.request["response"].getheader(
"Content-Type", ""
)
if "json" in content_type:
return json.loads(text)

text = self._clean_text(text)
html_tree = self._make_html_tree(text)
return html_tree
Expand Down
40 changes: 0 additions & 40 deletions juriscraper/opinions/united_states/state/lactapp_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import re
import urllib.parse
import urllib.request
from datetime import date, datetime
from urllib.parse import urljoin

Expand All @@ -23,11 +22,6 @@
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.log_tools import make_default_logger
from juriscraper.lib.string_utils import titlecase
from juriscraper.lib.utils import (
check_download_url,
check_empty_downloaded_file,
check_expected_content_types,
)
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

logger = make_default_logger()
Expand Down Expand Up @@ -199,37 +193,3 @@ def make_backscrape_iterable(self, kwargs):
self.back_scrape_iterable = unique_year_month(
self.back_scrape_iterable
)

async def download_content(
    self,
    download_url: str,
    doctor_is_available: bool = False,
    media_root: str = "",
) -> bytes:
    """Download opinion content using urllib to bypass Cloudflare

    Uses urllib instead of httpx because Cloudflare blocks httpx
    via TLS fingerprinting.

    Note that we don't need `media_root` or `doctor_is_available`
    since this won't be used in CL testing and we won't follow
    redirection due to the content being PDF
    """

    # The test_mode_is_enabled() branch is deliberately omitted:
    # it only matters for CL integration tests, which never reach
    # this child scraper, so copying it would be pure boilerplate.

    check_download_url(download_url)

    request = urllib.request.Request(
        download_url, headers={"User-Agent": "CourtListener"}
    )
    response = self.urllib_opener.open(request, timeout=90)
    pdf_content = response.read()

    check_empty_downloaded_file(pdf_content, download_url)
    check_expected_content_types(self, response, download_url)

    # cleanup_content is a no-op here; called only for compatibility
    return self.cleanup_content(pdf_content)
115 changes: 75 additions & 40 deletions juriscraper/opinions/united_states/state/masssuperct.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,105 +7,140 @@
Date: 2025-07-16
History:
- Created by luism
- 2026-03-25: Switched from JSON API to HTML page scraping
Notes:
Cloudflare blocks GET requests via TLS fingerprinting.
We use POST with an empty body to bypass this.
"""

import re
from datetime import date, datetime
from urllib.parse import urljoin
from urllib.parse import quote, urljoin

from lxml import etree, html

from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.exceptions import InvalidDocumentError
from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.log_tools import make_default_logger
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

logger = make_default_logger()


class Site(OpinionSiteLinear):
court_name = "Superior Court"
first_opinion_date = datetime(2017, 6, 20)
use_urllib = True
base_url = "https://www.socialaw.com/services/slip-opinions/"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "https://www.socialaw.com/customapi/slips/getopinions"
self.court_id = self.__module__
self.search_date = datetime.today()
self.parameters = {
"SectionName": self.court_name,
"ArchiveDate": self.search_date.strftime("%B %Y"),
}
self.url = self._build_url()
self.method = "POST"
self.parameters = {}
self.status = "Published"
self.expected_content_types = ["text/html"]
self.days_interval = 30
self.make_backscrape_iterable(kwargs)

def _build_url(self) -> str:
"""Build the listing URL with court and month query parameters.

:return: Full URL with encoded query parameters
"""
month_str = quote(self.search_date.strftime("%B %Y"))
court_str = quote(self.court_name)
return f"{self.base_url}?Court={court_str}&Month={month_str}"

def _process_html(self):
"""Scrape and process the JSON endpoint
"""Parse opinion listing from HTML accordion items.

:return: None
"""
for row in self.html:
url = urljoin(
"https://www.socialaw.com/services/slip-opinions/",
row["UrlName"],
for item in self.html.xpath(
"//div[contains(@class, 'slip-opinions-list')]"
"//div[@class='accordion-item']"
):
name = item.xpath(".//strong[contains(@class, 'title')]//text()")
name = name[0].strip() if name else ""

date_str = item.xpath(
".//div[contains(@class, 'dates-section')]"
"//div[@class='rich-text rich-text-sm']//text()"
)
details = row["Details"]
caption = titlecase(row.get("Parties"))
caption = re.sub(r"(\[\d{1,2}\])", "", caption)

judge_str = details.get("Present", "")
judge_str = re.sub(r"(\[\d{1,2}\])", "", judge_str)
judge_str = re.sub(r"\, JJ\.", "", judge_str)
judge_str = re.sub(
r"(Associate\s+)?Justice*|of the Superior Court", "", judge_str
date_str = date_str[0].strip() if date_str else ""

docket = item.xpath(
".//div[contains(@class, 'docket-section')]"
"//div[@class='section-header']"
"//div[@class='rich-text rich-text-sm']//text()"
)
docket = docket[0].strip() if docket else ""

url = item.xpath(
".//div[contains(@class, 'docket-section')]"
"//a[contains(@class, 'btn')]/@href"
)
url = urljoin("https://www.socialaw.com", url[0]) if url else ""

# Clear judge_str if it matches a date like 'July 16, 2024'
if re.match(r"^[A-Za-z]+\s+\d{1,2},\s+\d{4}$", judge_str.strip()):
judge_str = ""
if not name or not url:
logger.warning(
"masssuperct: missing name or URL for docket '%s', skipping",
docket,
)
continue

self.cases.append(
{
"name": caption,
"judge": judge_str,
"date": row["Date"],
"name": titlecase(name),
"date": date_str,
"url": url,
"docket": details["Docket"],
"docket": docket,
}
)

@staticmethod
def cleanup_content(content):
"""Remove non-opinion HTML

Cleanup HMTL from Social Law page so we can properly display the content
Cleanup HTML from Social Law page so we can properly display
the content.

:param content: The scraped HTML
:return: Cleaner HTML
"""
content = content.decode("utf-8")
tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
content = tree.xpath(
"//div[@id='contentPlaceholder_ctl00_ctl00_ctl00_detailContainer']"
)[0]
"//div[contains(@class, 'primary-content-rich-text')]"
)

if not content:
content = tree.xpath(
"//div[contains(@class, 'primary-content-body')]"
)
if not content:
raise InvalidDocumentError(
"masssuperct: no opinion content found in page"
)

new_tree = etree.Element("html")
body = etree.SubElement(new_tree, "body")
body.append(content)
return html.tostring(new_tree).decode("utf-8")
body.append(content[0])
return html.tostring(new_tree)

async def _download_backwards(self, search_date: date) -> None:
"""Download and process HTML for a given target date.

:param search_date (date): The date for which to download and process opinions.
:return None; sets the target date, downloads the corresponding HTML
and processes the HTML to extract case details.
:param search_date: The date for which to download and process
opinions.
:return: None
"""
self.search_date = search_date
self.parameters = {
"SectionName": self.court_name,
"ArchiveDate": self.search_date.strftime("%B %Y"),
}
self.url = self._build_url()
self.html = await self._download()
self._process_html()

Expand Down
Loading
Loading