From a12e2eed2d23cf9e4046cf2399956a9e8d02108d Mon Sep 17 00:00:00 2001
From: luism
Date: Thu, 2 Oct 2025 15:48:09 -0400
Subject: [PATCH 1/5] feat(AbstractSite): add residential proxy support to refresh IPs and avoid blocking

---
 juriscraper/AbstractSite.py                   | 94 +++++++++++++------
 .../opinions/united_states/state/ark.py       | 10 +-
 .../opinions/united_states/state/okla.py      |  1 +
 3 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py
index 7f5fb12dd..5150538b1 100644
--- a/juriscraper/AbstractSite.py
+++ b/juriscraper/AbstractSite.py
@@ -50,6 +50,7 @@ def __init__(self, cnt=None, **kwargs):
         super().__init__()

         # Computed metadata
+        self.additional_params = None
         self.hash = None
         self.html = None
         self.method = "GET"
@@ -72,6 +73,9 @@ def __init__(self, cnt=None, **kwargs):
             "status": None,
             "url": None,
         }
+        self.use_proxy = False
+        self.SCRAPINGBEE_API_KEY = os.environ.get("SCRAPINGBEE_API_KEY", None)
+        self.SCRAPINGBEE_API_URL = os.environ.get("SCRAPINGBEE_API_URL", None)

         # Attribute to reference a function passed by the caller,
         # which takes a single argument, the Site object, after
@@ -385,10 +389,10 @@ def _download(self, request_dict=None):

         if self.test_mode_enabled():
             self._request_url_mock(self.url)
-        elif self.method == "GET":
-            self._request_url_get(self.url)
-        elif self.method == "POST":
-            self._request_url_post(self.url)
+        elif self.use_proxy:
+            self._request_url_via_proxy(self.url)
+        else:
+            self._request_url(self.url)

         self._post_process_response()
         return self._return_response_text_object()
@@ -434,13 +438,28 @@ def download_content(

         # Note that we do a GET even if self.method is POST. This is
         # deliberate.
-        r = s.get(
-            download_url,
-            verify=has_cipher,  # WA has a certificate we don't understand
-            headers=headers,
-            cookies=self.cookies,
-            timeout=300,
-        )
+        if self.use_proxy:
+            params = {
+                "api_key": self.SCRAPINGBEE_API_KEY,
+                "url": download_url,
+                "render_js": "false",
+                "cookies": self.cookies,
+                "country_code": "us",
+                'premium_proxy': 'true',
+            }
+
+            r = s.get(
+                self.SCRAPINGBEE_API_URL,
+                params=params,
+            )
+        else:
+            r = s.get(
+                download_url,
+                verify=has_cipher,  # WA has a certificate we don't understand
+                headers=headers,
+                cookies=self.cookies,
+                timeout=300,
+            )

         # test for empty files (thank you CA1)
         if len(r.content) == 0:
@@ -494,32 +513,51 @@ def _process_request_parameters(self, parameters=None):
                 del parameters["verify"]
             self.request["parameters"].update(parameters)

-    def _request_url_get(self, url):
-        """Execute GET request and assign appropriate request dictionary
-        values
-        """
+    def _request_url(self, url):
+        """Execute GET or POST request and assign appropriate request dictionary values"""
         self.request["url"] = url
-        self.request["response"] = self.request["session"].get(
-            url,
+        session = self.request["session"]
+        request_args = dict(
+            url=url,
             headers=self.request["headers"],
             verify=self.request["verify"],
             timeout=60,
             **self.request["parameters"],
         )
+        if self.method == "POST":
+            request_args["data"] = self.parameters
+            self.request["response"] = session.post(**request_args)
+        else:
+            self.request["response"] = session.get(**request_args)
         if self.save_response:
             self.save_response(self)

-    def _request_url_post(self, url):
-        """Execute POST request and assign appropriate request dictionary values"""
-        self.request["url"] = url
-        self.request["response"] = self.request["session"].post(
-            url,
-            headers=self.request["headers"],
-            verify=self.request["verify"],
-            data=self.parameters,
-            timeout=60,
-            **self.request["parameters"],
-        )
+    def _request_url_via_proxy(self, url):
+        if not self.SCRAPINGBEE_API_KEY or not self.SCRAPINGBEE_API_URL:
+            raise RuntimeError("SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment.")
+
+        base_proxy_params = {
+            'api_key': self.SCRAPINGBEE_API_KEY,
+            'url': url,
+            'premium_proxy': 'true',
+            'country_code': 'us',
+            'block_resources': 'false',
+        }
+        if self.additional_params:
+            base_proxy_params.update(self.additional_params)
+
+        if self.method == "POST":
+            self.request["response"] = self.request["session"].post(
+                self.SCRAPINGBEE_API_URL,
+                params=base_proxy_params,
+                data=self.parameters,
+            )
+        else:
+            self.request["response"] = self.request["session"].get(
+                self.SCRAPINGBEE_API_URL,
+                params=base_proxy_params,
+            )
+
         if self.save_response:
             self.save_response(self)

diff --git a/juriscraper/opinions/united_states/state/ark.py b/juriscraper/opinions/united_states/state/ark.py
index 64c67283d..9af4d5e74 100644
--- a/juriscraper/opinions/united_states/state/ark.py
+++ b/juriscraper/opinions/united_states/state/ark.py
@@ -5,7 +5,7 @@
 import re
 from datetime import date, datetime, timedelta
 from typing import Any, Optional
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urljoin

 from juriscraper.AbstractSite import logger
 from juriscraper.lib.string_utils import normalize_dashes, titlecase
@@ -13,7 +13,8 @@


 class Site(OpinionSiteLinear):
-    base_url = "https://opinions.arcourts.gov/ark/en/d/s/index.do"
+    base_url = "https://opinions.arcourts.gov"
+    base_endpoint = urljoin(base_url, "/ark/en/d/s/index.do")
     court_code = "144"
     cite_regex = re.compile(r"\d{2,4} Ark\. \d+", re.IGNORECASE)
     first_opinion_date = datetime(1979, 9, 3)
@@ -25,6 +26,7 @@ def __init__(self, *args, **kwargs):
         self.court_id = self.__module__
         self.set_url()
         self.make_backscrape_iterable(kwargs)
+        self.use_proxy = True

     def _process_html(self) -> None:
         """Parse HTML into case dictionaries
@@ -44,7 +46,7 @@ def _process_html(self) -> None:
             per_curiam = False

             name = item.xpath(".//a/text()")[0]
-            url = item.xpath(".//a/@href")[1]
+            url = urljoin(self.base_url, item.xpath(".//a/@href")[1])
             if re.search(self.not_a_opinion_regex, name.upper()):
                 logger.info("Skipping %s %s, invalid document", name, url)
                 continue
@@ -96,7 +98,7 @@ def set_url(
             "or": "date",
             "iframe": "true",
         }
-        self.url = f"{self.base_url}?{urlencode(params)}"
+        self.url = urljoin(self.base_endpoint, f"?{urlencode(params)}")

     def extract_from_text(self, scraped_text: str) -> dict[str, Any]:
         """Pass scraped text into function and return data as a dictionary
diff --git a/juriscraper/opinions/united_states/state/okla.py b/juriscraper/opinions/united_states/state/okla.py
index 8b6d5e333..f93effa81 100644
--- a/juriscraper/opinions/united_states/state/okla.py
+++ b/juriscraper/opinions/united_states/state/okla.py
@@ -22,6 +22,7 @@ def __init__(self, *args, **kwargs):
         self.status = "Published"
         self.expected_content_types = ["text/html"]
         self.should_have_results = True
+        self.use_proxy = True

     def _process_html(self):
         for row in self.html.xpath(".//li[@class='decision']"):

From e08c69d95dbc2a24a63c5c01585462addfcdb29e Mon Sep 17 00:00:00 2001
From: luism
Date: Thu, 2 Oct 2025 16:59:45 -0400
Subject: [PATCH 2/5] feat: implement residential proxy usage and parameter handling for improved scraping

---
 CHANGES.md                                    |   2 +-
 juriscraper/AbstractSite.py                   |   4 +
 .../opinions/united_states/state/minn.py      |  10 +-
 .../opinions/united_states/state/miss.py      | 122 +++++-------
 .../opinions/united_states/state/nm.py        |  10 +-
 5 files changed, 50 insertions(+), 98 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 8d0cc30a9..46b38d83d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -15,7 +15,7 @@ Releases are also tagged in git, if that's helpful.
 The following changes are not yet released, but are code complete:

 Features:
--
+- add a residential proxy to AbstractSite to help with sites that block known data center IPs #1616

 Changes:
 -
diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py
index 5150538b1..a176239ce 100644
--- a/juriscraper/AbstractSite.py
+++ b/juriscraper/AbstractSite.py
@@ -3,6 +3,7 @@
 import os
 from datetime import date, datetime, timedelta
 from typing import Union
+from urllib.parse import urlencode

 import certifi
 import requests
@@ -536,6 +537,9 @@ def _request_url_via_proxy(self, url):
         if not self.SCRAPINGBEE_API_KEY or not self.SCRAPINGBEE_API_URL:
             raise RuntimeError("SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment.")

+        if self.request["parameters"].get("params"):
+            self.url += "?" + urlencode(self.request["parameters"]["params"])
+
         base_proxy_params = {
             'api_key': self.SCRAPINGBEE_API_KEY,
             'url': url,
diff --git a/juriscraper/opinions/united_states/state/minn.py b/juriscraper/opinions/united_states/state/minn.py
index 45b6d0644..492b41a5e 100644
--- a/juriscraper/opinions/united_states/state/minn.py
+++ b/juriscraper/opinions/united_states/state/minn.py
@@ -30,11 +30,12 @@ def __init__(self, *args, **kwargs):
         self.status = "Unpublished"
         self.url = "https://mn.gov/law-library/search/"

-        self.params = self.base_params = {
+        self.params = {
             "v:sources": "mn-law-library-opinions",
             "query": f" (url:/archive/{self.court_query}) ",
             "sortby": "date",
         }
+
         self.request["verify"] = False
         self.request["headers"] = {
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
@@ -47,16 +48,16 @@ def __init__(self, *args, **kwargs):
             "Referer": "https://mn.gov/law-library/search/?v%3Asources=mn-law-library-opinions&query=+%28url%3A%2Farchive%2Fsupct%29+&citation=&qt=&sortby=&docket=&case=&v=&p=&start-date=&end-date=",
             "Connection": "keep-alive",
         }
+        self.request["parameters"]["params"] = self.params
         self.make_backscrape_iterable(kwargs)
         self.needs_special_headers = True
+        # self.use_proxy = True

     def _process_html(self) -> None:
         """Process the html and extract out the opinions

         :return: None
         """
-        self.html = self._download({"params": self.params})
-
         # This warning is useful for backscraping
         results_number = self.html.xpath(
             "//div[@class='searchresult_number']/text()"
@@ -123,7 +124,7 @@ def _process_html(self) -> None:

     def _download_backwards(self, dates: tuple[date]):
         logger.info("Backscraping for range %s - %s", *dates)
-        params = {**self.base_params}
+        params = {**self.params}
         params.update(
             {
                 "start-date": dates[0].strftime("%-m/%-d/%Y"),
@@ -132,3 +133,4 @@ def _download_backwards(self, dates: tuple[date]):
             }
         )
         self.params = params
+        self.request["parameters"]["params"] = self.params
diff --git a/juriscraper/opinions/united_states/state/miss.py b/juriscraper/opinions/united_states/state/miss.py
index 5039d3b80..ad92cc1c4 100644
--- a/juriscraper/opinions/united_states/state/miss.py
+++ b/juriscraper/opinions/united_states/state/miss.py
@@ -1,102 +1,44 @@
 # Court Contact: bkraft@courts.ms.gov (see https://courts.ms.gov/aoc/aoc.php)
+from urllib.parse import urljoin

-import datetime
-
-from juriscraper.lib.string_utils import convert_date_string
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+from datetime import date, timedelta

-
-# Landing page: https://courts.ms.gov/appellatecourts/sc/scdecisions.php
 class Site(OpinionSiteLinear):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.domain = "https://courts.ms.gov"
         self.court_id = self.__module__
-        self.method = "POST"
-        self.number_of_dates_to_process = 5
-        self.pages = {}
-        self.parameters = {"crt": self.get_court_parameter()}
+        self.method = "GET"
+        self.publish_date = "09/11/2025"
         self.status = "Published"
-        self.url = f"{self.domain}/appellatecourts/docket/gethddates.php"
-
-    def get_court_parameter(self):
-        return "SCT"
-
-    """Retrieve dates for which there are case listings.
-    This site's architecture is no bueno. We have to issue
-    a POST request to this page to get a array (in the form
-    of a string) or dates that have cases associated with
-    them.
-    """
+        self.url = f"https://courts.ms.gov/appellatecourts/sc/scdecisions.php?date={self.publish_date}"
+        self.use_proxy = True
+        self.additional_params = {'wait_for': '#dispAreaHD > p:nth-child(2) > a'}

-    def _download(self, request_dict=None):
-        if request_dict is None:
-            request_dict = {}
-        dates_page = super()._download(request_dict)
-        self.parse_date_pages(dates_page)
-
-    """Keep track of the most recent N date pages.
-    We dont want to crawl all the way back to 1996, so we only
-    parse the most recent [self.number_of_dates_to_process]
-    number of date pages. Since cases are usually published
-    once a week, this means scraping about the most recent
-    months worth of cases.
-    """
-
-    def parse_date_pages(self, dates_page):
-        # For testing, each example file should be a specific sub-date page,
-        # like https://courts.ms.gov/Images/HDList/SCT02-27-2020.html
-        if self.test_mode_enabled():
-            # date below is arbitrary and doesnt matter, it just
-            # needs to be static for testing to work
-            self.pages["2020-02-28"] = dates_page
-            return
-        for date in self.get_dates_from_date_page(dates_page):
-            url = "{}/Images/HDList/SCT{}.html".format(
-                self.domain,
-                datetime.date.strftime(date, "%m-%d-%Y"),
-            )
-            page = self._get_html_tree_by_url(url)
-            self.pages[f"{date}"] = page
-
-    """Convert string of dates on page into list of date objects.
-    """
-
-    def get_dates_from_date_page(self, dates_page):
-        dates = []
-        substrings = dates_page.text_content().split('"')
-        for substring in substrings:
-            try:
-                dates.append(convert_date_string(substring))
-            except ValueError:
-                pass
-        dates.sort(reverse=True)
-        return dates[: self.number_of_dates_to_process]
+    @staticmethod
+    def most_recent_release_date(day: int):
+        """Return the most recent past date on the given weekday, as MM/DD/YYYY"""
+        delta = (date.today().weekday() - day) % 7
+        return (date.today() - timedelta(days=delta or 7)).strftime("%m/%d/%Y")

     def _process_html(self):
-        for date, page in self.pages.items():
-            for anchor in page.xpath(".//a[contains(./@href, '.pdf')]"):
-                parent = anchor.getparent()
-
-                # sometimes the first opinion on the pages is nested
-                # in a tag for whatever reason.
-                while parent.getparent().tag != "body":
-                    parent = parent.getparent()
-
-                sections = parent.xpath("./following-sibling::ul")
-                if not sections:
-                    # the while loop above should mean we never fall in here
-                    continue
-
-                section = sections[0]
-                self.cases.append(
-                    {
-                        "date": date,
-                        "docket": anchor.text_content().strip(),
-                        "name": section.xpath(".//b")[0]
-                        .text_content()
-                        .strip(),
-                        "summary": section.text_content().strip(),
-                        "url": anchor.xpath("./@href")[0],
-                    }
-                )
+        """Process the html
+
+        :return: None
+        """
+        for link in self.html.xpath("//div[@id='dispAreaHD']//a[contains(@href, '.pdf')]"):
+            slug = link.xpath("./@href")[0]
+            if not slug.startswith("http"):
+                slug = urljoin("https://courts.ms.gov/images/", slug[3:].replace("\\", "/"))
+            ul_nodes = link.xpath("./following::ul[1]")
+            if not ul_nodes:
+                continue
+            self.cases.append(
+                {
+                    "date": self.publish_date,
+                    "docket": link.text_content().strip(),
+                    "name": ul_nodes[0].xpath(".//b")[0].text_content().strip(),
+                    "summary": ul_nodes[0].text_content().strip(),
+                    "url": slug,
+                }
+            )
diff --git a/juriscraper/opinions/united_states/state/nm.py b/juriscraper/opinions/united_states/state/nm.py
index a4babae39..99e94807f 100644
--- a/juriscraper/opinions/united_states/state/nm.py
+++ b/juriscraper/opinions/united_states/state/nm.py
@@ -1,7 +1,7 @@
 import re
 from datetime import date, datetime, timedelta
 from typing import Any, Optional
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urljoin

 from juriscraper.AbstractSite import logger
 from juriscraper.lib.string_utils import titlecase
@@ -17,7 +17,7 @@ class Site(OpinionSiteLinear):
     Additionally, we moved docket number capture to PDF extraction, to limit
     the number of requests.
     """
-    base_url = "https://nmonesource.com/nmos/en/d/s/index.do"
+    base_url = "https://nmonesource.com/"
     court_code = "182"
     first_opinion_date = datetime(1900, 1, 1)
     days_interval = 15
@@ -27,6 +27,7 @@ def __init__(self, *args, **kwargs):
         self.court_id = self.__module__
         self.set_url()
         self.make_backscrape_iterable(kwargs)
+        self.use_proxy = True

     def _process_html(self) -> None:
         """Parse HTML into case dictionaries
@@ -46,6 +47,8 @@ def _process_html(self) -> None:
             url = row.xpath(
                 ".//a[contains(@title, 'Download the PDF version')]/@href"
             )[0]
+            url = urljoin(self.base_url, url)
+
             name = row.xpath(".//span[@class='title']/a/text()")[0]
             date_filed = row.xpath(".//span[@class='publicationDate']/text()")[
                 0
@@ -106,7 +109,8 @@ def set_url(
             "or": "date",
             "iframe": "true",
         }
-        self.url = f"{self.base_url}?{urlencode(params)}"
+
+        self.url = urljoin(self.base_url, "nmos/en/d/s/index.do") + f"?{urlencode(params)}"

     def _download_backwards(self, dates: tuple[date]) -> None:
         """Make custom date range request

From a7f18f730443cce0f0e39d95c27b0214eb91d2e4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 2 Oct 2025 21:05:23 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 juriscraper/AbstractSite.py                   | 16 +++++++-------
 .../opinions/united_states/state/miss.py      | 21 ++++++++++++++-----
 .../opinions/united_states/state/nm.py        |  5 ++++-
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py
index a176239ce..1cfa9a72c 100644
--- a/juriscraper/AbstractSite.py
+++ b/juriscraper/AbstractSite.py
@@ -446,7 +446,7 @@ def download_content(
                 "render_js": "false",
"cookies": self.cookies, "country_code": "us", - 'premium_proxy': 'true', + "premium_proxy": "true", } r = s.get( @@ -535,17 +535,19 @@ def _request_url(self, url): def _request_url_via_proxy(self, url): if not self.SCRAPINGBEE_API_KEY or not self.SCRAPINGBEE_API_URL: - raise RuntimeError("SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment.") + raise RuntimeError( + "SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment." + ) if self.request["parameters"].get("params"): self.url += "?" + urlencode(self.request["parameters"]["params"]) base_proxy_params = { - 'api_key': self.SCRAPINGBEE_API_KEY, - 'url': url, - 'premium_proxy': 'true', - 'country_code': 'us', - 'block_resources': 'false', + "api_key": self.SCRAPINGBEE_API_KEY, + "url": url, + "premium_proxy": "true", + "country_code": "us", + "block_resources": "false", } if self.additional_params: base_proxy_params.update(self.additional_params) diff --git a/juriscraper/opinions/united_states/state/miss.py b/juriscraper/opinions/united_states/state/miss.py index ad92cc1c4..da391a088 100644 --- a/juriscraper/opinions/united_states/state/miss.py +++ b/juriscraper/opinions/united_states/state/miss.py @@ -1,8 +1,9 @@ # Court Contact: bkraft@courts.ms.gov (see https://courts.ms.gov/aoc/aoc.php) +from datetime import date, timedelta from urllib.parse import urljoin from juriscraper.OpinionSiteLinear import OpinionSiteLinear -from datetime import date, timedelta + class Site(OpinionSiteLinear): def __init__(self, *args, **kwargs): @@ -13,7 +14,9 @@ def __init__(self, *args, **kwargs): self.status = "Published" self.url = f"https://courts.ms.gov/appellatecourts/sc/scdecisions.php?date={self.publish_date}" self.use_proxy = True - self.additional_params = {'wait_for': '#dispAreaHD > p:nth-child(2) > a'} + self.additional_params = { + "wait_for": "#dispAreaHD > p:nth-child(2) > a" + } @staticmethod def most_recent_release_date(day: int): @@ -26,10 +29,15 @@ def _process_html(self): :return: None """ - for link in self.html.xpath("//div[@id='dispAreaHD']//a[contains(@href, '.pdf')]"): + for link in self.html.xpath( + "//div[@id='dispAreaHD']//a[contains(@href, '.pdf')]" + ): slug = link.xpath("./@href")[0] if not slug.startswith("http"): - slug = urljoin("https://courts.ms.gov/images/", slug[3:].replace("\\", "/")) + slug = urljoin( + "https://courts.ms.gov/images/", + slug[3:].replace("\\", "/"), + ) ul_nodes = link.xpath("./following::ul[1]") if not ul_nodes: continue @@ -37,7 +45,10 @@ def _process_html(self): { "date": self.publish_date, "docket": link.text_content().strip(), - "name": ul_nodes[0].xpath(".//b")[0].text_content().strip(), + "name": ul_nodes[0] + .xpath(".//b")[0] + .text_content() + .strip(), "summary": ul_nodes[0].text_content().strip(), "url": slug, } diff --git a/juriscraper/opinions/united_states/state/nm.py b/juriscraper/opinions/united_states/state/nm.py index 99e94807f..eec991ae8 100644 --- a/juriscraper/opinions/united_states/state/nm.py +++ b/juriscraper/opinions/united_states/state/nm.py @@ -110,7 +110,10 @@ def set_url( "iframe": "true", } - self.url = urljoin(self.base_url, "nmos/en/d/s/index.do") + f"?{urlencode(params)}" + self.url = ( + urljoin(self.base_url, "nmos/en/d/s/index.do") + + f"?{urlencode(params)}" + ) def _download_backwards(self, dates: tuple[date]) -> None: """Make custom date range request From 0dd45407aed45982325bdb188af5ff654a48fc06 Mon Sep 17 00:00:00 2001 From: luism Date: Thu, 9 Oct 2025 15:27:11 -0400 Subject: [PATCH 4/5] feat(mo): update URL handling to use urljoin 

---
 juriscraper/opinions/united_states/state/mo.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/juriscraper/opinions/united_states/state/mo.py b/juriscraper/opinions/united_states/state/mo.py
index 18d84a810..c255a06a7 100644
--- a/juriscraper/opinions/united_states/state/mo.py
+++ b/juriscraper/opinions/united_states/state/mo.py
@@ -6,6 +6,7 @@
 """

 from datetime import date
+from urllib.parse import urljoin

 from juriscraper.OpinionSiteLinear import OpinionSiteLinear

@@ -15,12 +16,14 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.court = "Supreme"
+        self.base_url = "https://www.courts.mo.gov"
         self.url = self.build_url()
         self.status = "Published"
+        self.use_proxy = True

     def build_url(self):
         year = date.today().year
-        return f"https://www.courts.mo.gov/page.jsp?id=12086&dist=Opinions%20{self.court}&date=all&year={year}#all"
+        return urljoin(self.base_url, f"/page.jsp?id=12086&dist=Opinions%20{self.court}&date=all&year={year}#all")

     def _process_html(self):
         for row in self.html.xpath("//div[@class='margin-bottom-15']"):
@@ -29,7 +32,7 @@ def _process_html(self):
             links = opinion.xpath("a")
             if len(links) != 2:
                 continue
-            url = opinion.xpath("a")[1].get("href")
+            url = urljoin(self.base_url,opinion.xpath("a")[1].get("href"))
             all_text = opinion.xpath(".//text()")
             case_metadata = [t.strip() for t in all_text if t.strip()]
             docket, _, name, _, author, _, vote = case_metadata

From 9ac9f0f360d9ad576a7571860c6ac182efd685ab Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 9 Oct 2025 19:27:31 +0000
Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 juriscraper/opinions/united_states/state/mo.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/juriscraper/opinions/united_states/state/mo.py b/juriscraper/opinions/united_states/state/mo.py
index c255a06a7..5b5342701 100644
--- a/juriscraper/opinions/united_states/state/mo.py
+++ b/juriscraper/opinions/united_states/state/mo.py
@@ -23,7 +23,10 @@ def __init__(self, *args, **kwargs):

     def build_url(self):
         year = date.today().year
-        return urljoin(self.base_url, f"/page.jsp?id=12086&dist=Opinions%20{self.court}&date=all&year={year}#all")
+        return urljoin(
+            self.base_url,
+            f"/page.jsp?id=12086&dist=Opinions%20{self.court}&date=all&year={year}#all",
+        )

     def _process_html(self):
         for row in self.html.xpath("//div[@class='margin-bottom-15']"):
@@ -32,7 +35,7 @@ def _process_html(self):
             links = opinion.xpath("a")
             if len(links) != 2:
                 continue
-            url = urljoin(self.base_url,opinion.xpath("a")[1].get("href"))
+            url = urljoin(self.base_url, opinion.xpath("a")[1].get("href"))
             all_text = opinion.xpath(".//text()")
             case_metadata = [t.strip() for t in all_text if t.strip()]
             docket, _, name, _, author, _, vote = case_metadata
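
A minimal usage sketch, separate from the patch series above, showing how a
scraper would exercise the proxy path these commits introduce. The ScrapingBee
endpoint URL and the API key below are placeholders/assumptions; only the
environment variable names, the use_proxy flag, site.cases, and the okla
scraper module come from the patches themselves, and site.parse() is assumed
to be the usual juriscraper entry point.

    import os

    from juriscraper.opinions.united_states.state import okla

    # Normally exported in the shell; set here only to show what
    # AbstractSite.__init__ reads in patch 1. Both values are placeholders.
    os.environ.setdefault("SCRAPINGBEE_API_KEY", "<your-api-key>")
    os.environ.setdefault("SCRAPINGBEE_API_URL", "https://app.scrapingbee.com/api/v1/")  # assumed endpoint

    site = okla.Site()  # okla.Site sets use_proxy = True in __init__ (patch 1)
    site.parse()        # _download() now routes the request through _request_url_via_proxy()
    print(f"{len(site.cases)} opinions collected through the residential proxy")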