Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ Features:

Changes:
- Abstract validation functions out of `AbstractSite.download_content` into reusable utils #1882
- Abstract urllib `download_content` into `AbstractSite` for scrapers with `use_urllib = True` #1714

Fixes:
- Fix `masssuperct` by switching from JSON API to HTML scraping with urllib to bypass Cloudflare TLS fingerprinting #1714
- Fix `lactapp_3` opinion download by using urllib instead of httpx to bypass Cloudflare TLS fingerprinting #1882
- Fix `mich` scraper failing when API returns null courts #1885

Expand Down
43 changes: 32 additions & 11 deletions juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,22 @@ async def _download(self, request_dict=None):
self._post_process_response()
return self._return_response_text_object()

def _download_content_urllib(self, download_url: str, headers: dict):
"""Download content using urllib to bypass Cloudflare

Uses urllib instead of httpx because Cloudflare blocks httpx
via TLS fingerprinting. Used by scrapers with `use_urllib = True`.

:param download_url: The URL for the item you wish to download.
:param headers: headers dict
:return: A response object with a `content` field
"""
req = urllib.request.Request(download_url, headers=headers)
response = self.urllib_opener.open(req, timeout=90)
response.content = response.read()

return response

async def download_content(
self,
download_url: str,
Expand Down Expand Up @@ -437,26 +453,28 @@ def handler(request: httpx.Request):
r = await s.get(url=self.url)
return self.cleanup_content(r.content)

s = self.request["session"]

if self.needs_special_headers:
headers = self.request["headers"]
else:
headers = {"User-Agent": "CourtListener"}

# Note that we do a GET even if self.method is POST. This is
# deliberate.
r = await s.get(
download_url,
headers=headers,
cookies=self.cookies,
timeout=300,
)
if self.use_urllib:
r = self._download_content_urllib(download_url, headers)
else:
s = self.request["session"]
# Note that we do a GET even if self.method is POST. This is
# deliberate.
r = await s.get(
download_url,
headers=headers,
cookies=self.cookies,
timeout=300,
)

check_empty_downloaded_file(r, download_url)
check_expected_content_types(self, r, download_url)

if doctor_is_available:
if doctor_is_available and not self.use_urllib:
# test for and follow meta redirects, uses doctor get_extension
# service
r = await follow_redirections(r, s)
Expand Down Expand Up @@ -489,15 +507,18 @@ def _download_urllib(self):
data = None
if self.method == "POST":
data = urllib.parse.urlencode(self.parameters).encode("utf-8")

raw = self._urllib_fetch(self.url, data=data)
text = raw.decode("utf-8")

content_type = ""
if hasattr(self.request["response"], "getheader"):
content_type = self.request["response"].getheader(
"Content-Type", ""
)
if "json" in content_type:
return json.loads(text)

text = self._clean_text(text)
html_tree = self._make_html_tree(text)
return html_tree
Expand Down
40 changes: 0 additions & 40 deletions juriscraper/opinions/united_states/state/lactapp_3.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

import re
import urllib.parse
import urllib.request
from datetime import date, datetime
from urllib.parse import urljoin

Expand All @@ -23,11 +22,6 @@
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.log_tools import make_default_logger
from juriscraper.lib.string_utils import titlecase
from juriscraper.lib.utils import (
check_download_url,
check_empty_downloaded_file,
check_expected_content_types,
)
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

logger = make_default_logger()
Expand Down Expand Up @@ -199,37 +193,3 @@ def make_backscrape_iterable(self, kwargs):
self.back_scrape_iterable = unique_year_month(
self.back_scrape_iterable
)

async def download_content(
    self,
    download_url: str,
    doctor_is_available: bool = False,
    media_root: str = "",
) -> bytes:
    """Download opinion content using urllib to bypass Cloudflare

    Uses urllib instead of httpx because Cloudflare blocks httpx
    via TLS fingerprinting.

    Note that we don't need `media_root` or `doctor_is_available`
    since this won't be used in CL testing and we won't follow
    redirection due to the content being PDF
    """

    # The test_mode_is_enabled() branch is deliberately omitted:
    # it only matters for CL integration tests, which never reach
    # this child scraper, so copying it would be pure boilerplate.

    check_download_url(download_url)

    request = urllib.request.Request(
        download_url, headers={"User-Agent": "CourtListener"}
    )
    response = self.urllib_opener.open(request, timeout=90)
    pdf_content = response.read()

    check_empty_downloaded_file(pdf_content, download_url)
    check_expected_content_types(self, response, download_url)

    # cleanup_content is a no-op here; called only for compatibility
    return self.cleanup_content(pdf_content)
115 changes: 75 additions & 40 deletions juriscraper/opinions/united_states/state/masssuperct.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,105 +7,140 @@
Date: 2025-07-16
History:
- Created by luism
- 2026-03-25: Switched from JSON API to HTML page scraping
Notes:
Cloudflare blocks GET requests via TLS fingerprinting.
We use POST with an empty body to bypass this.
"""

import re
from datetime import date, datetime
from urllib.parse import urljoin
from urllib.parse import quote, urljoin

from lxml import etree, html

from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.exceptions import InvalidDocumentError
from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.log_tools import make_default_logger
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

logger = make_default_logger()


class Site(OpinionSiteLinear):
court_name = "Superior Court"
first_opinion_date = datetime(2017, 6, 20)
use_urllib = True
base_url = "https://www.socialaw.com/services/slip-opinions/"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "https://www.socialaw.com/customapi/slips/getopinions"
self.court_id = self.__module__
self.search_date = datetime.today()
self.parameters = {
"SectionName": self.court_name,
"ArchiveDate": self.search_date.strftime("%B %Y"),
}
self.url = self._build_url()
self.method = "POST"
self.parameters = {}
self.status = "Published"
self.expected_content_types = ["text/html"]
self.days_interval = 30
self.make_backscrape_iterable(kwargs)

def _build_url(self) -> str:
"""Build the listing URL with court and month query parameters.

:return: Full URL with encoded query parameters
"""
month_str = quote(self.search_date.strftime("%B %Y"))
court_str = quote(self.court_name)
return f"{self.base_url}?Court={court_str}&Month={month_str}"

def _process_html(self):
"""Scrape and process the JSON endpoint
"""Parse opinion listing from HTML accordion items.

:return: None
"""
for row in self.html:
url = urljoin(
"https://www.socialaw.com/services/slip-opinions/",
row["UrlName"],
for item in self.html.xpath(
"//div[contains(@class, 'slip-opinions-list')]"
"//div[@class='accordion-item']"
):
name = item.xpath(".//strong[contains(@class, 'title')]//text()")
name = name[0].strip() if name else ""

date_str = item.xpath(
".//div[contains(@class, 'dates-section')]"
"//div[@class='rich-text rich-text-sm']//text()"
)
details = row["Details"]
caption = titlecase(row.get("Parties"))
caption = re.sub(r"(\[\d{1,2}\])", "", caption)

judge_str = details.get("Present", "")
judge_str = re.sub(r"(\[\d{1,2}\])", "", judge_str)
judge_str = re.sub(r"\, JJ\.", "", judge_str)
judge_str = re.sub(
r"(Associate\s+)?Justice*|of the Superior Court", "", judge_str
date_str = date_str[0].strip() if date_str else ""

docket = item.xpath(
".//div[contains(@class, 'docket-section')]"
"//div[@class='section-header']"
"//div[@class='rich-text rich-text-sm']//text()"
)
docket = docket[0].strip() if docket else ""

url = item.xpath(
".//div[contains(@class, 'docket-section')]"
"//a[contains(@class, 'btn')]/@href"
)
url = urljoin("https://www.socialaw.com", url[0]) if url else ""

# Clear judge_str if it matches a date like 'July 16, 2024'
if re.match(r"^[A-Za-z]+\s+\d{1,2},\s+\d{4}$", judge_str.strip()):
judge_str = ""
if not name or not url:
logger.warning(
"masssuperct: missing name or URL for docket '%s', skipping",
docket,
)
continue

self.cases.append(
{
"name": caption,
"judge": judge_str,
"date": row["Date"],
"name": titlecase(name),
"date": date_str,
"url": url,
"docket": details["Docket"],
"docket": docket,
}
)

@staticmethod
def cleanup_content(content):
"""Remove non-opinion HTML

Cleanup HMTL from Social Law page so we can properly display the content
Cleanup HTML from Social Law page so we can properly display
the content.

:param content: The scraped HTML
:return: Cleaner HTML
"""
content = content.decode("utf-8")
tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
content = tree.xpath(
"//div[@id='contentPlaceholder_ctl00_ctl00_ctl00_detailContainer']"
)[0]
"//div[contains(@class, 'primary-content-rich-text')]"
)

if not content:
content = tree.xpath(
"//div[contains(@class, 'primary-content-body')]"
)
if not content:
raise InvalidDocumentError(
"masssuperct: no opinion content found in page"
)

new_tree = etree.Element("html")
body = etree.SubElement(new_tree, "body")
body.append(content)
return html.tostring(new_tree).decode("utf-8")
body.append(content[0])
return html.tostring(new_tree)

async def _download_backwards(self, search_date: date) -> None:
"""Download and process HTML for a given target date.

:param search_date (date): The date for which to download and process opinions.
:return None; sets the target date, downloads the corresponding HTML
and processes the HTML to extract case details.
:param search_date: The date for which to download and process
opinions.
:return: None
"""
self.search_date = search_date
self.parameters = {
"SectionName": self.court_name,
"ArchiveDate": self.search_date.strftime("%B %Y"),
}
self.url = self._build_url()
self.html = await self._download()
self._process_html()

Expand Down
Loading
Loading