diff --git a/CHANGES.md b/CHANGES.md index 8d0cc30a9..83e603112 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,8 @@ Releases are also tagged in git, if that's helpful. The following changes are not yet released, but are code complete: Features: -- +- add a residential proxy to AbstractSite to help with sites that block known data center IPs #1616 +- add scraper for Louisiana Court of Appeal, Third Circuit #1455 Changes: - diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index 7f5fb12dd..1cfa9a72c 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -3,6 +3,7 @@ import os from datetime import date, datetime, timedelta from typing import Union +from urllib.parse import urlencode import certifi import requests @@ -50,6 +51,7 @@ def __init__(self, cnt=None, **kwargs): super().__init__() # Computed metadata + self.additional_params = None self.hash = None self.html = None self.method = "GET" @@ -72,6 +74,9 @@ def __init__(self, cnt=None, **kwargs): "status": None, "url": None, } + self.use_proxy = False + self.SCRAPINGBEE_API_KEY = os.environ.get("SCRAPINGBEE_API_KEY", None) + self.SCRAPINGBEE_API_URL = os.environ.get("SCRAPINGBEE_API_URL", None) # Attribute to reference a function passed by the caller, # which takes a single argument, the Site object, after @@ -385,10 +390,10 @@ def _download(self, request_dict=None): if self.test_mode_enabled(): self._request_url_mock(self.url) - elif self.method == "GET": - self._request_url_get(self.url) - elif self.method == "POST": - self._request_url_post(self.url) + elif self.use_proxy: + self._request_url_via_proxy(self.url) + else: + self._request_url(self.url) self._post_process_response() return self._return_response_text_object() @@ -434,13 +439,28 @@ def download_content( # Note that we do a GET even if self.method is POST. This is # deliberate. 
- r = s.get( - download_url, - verify=has_cipher, # WA has a certificate we don't understand - headers=headers, - cookies=self.cookies, - timeout=300, - ) + if self.use_proxy: + params = { + "api_key": self.SCRAPINGBEE_API_KEY, + "url": download_url, + "render_js": "false", + "cookies": self.cookies, + "country_code": "us", + "premium_proxy": "true", + } + + r = s.get( + self.SCRAPINGBEE_API_URL, + params=params, + ) + else: + r = s.get( + download_url, + verify=has_cipher, # WA has a certificate we don't understand + headers=headers, + cookies=self.cookies, + timeout=300, + ) # test for empty files (thank you CA1) if len(r.content) == 0: @@ -494,32 +514,56 @@ def _process_request_parameters(self, parameters=None): del parameters["verify"] self.request["parameters"].update(parameters) - def _request_url_get(self, url): - """Execute GET request and assign appropriate request dictionary - values - """ + def _request_url(self, url): + """Execute GET or POST request and assign appropriate request dictionary values""" self.request["url"] = url - self.request["response"] = self.request["session"].get( - url, + session = self.request["session"] + request_args = dict( + url=url, headers=self.request["headers"], verify=self.request["verify"], timeout=60, **self.request["parameters"], ) + if self.method == "POST": + request_args["data"] = self.parameters + self.request["response"] = session.post(**request_args) + else: + self.request["response"] = session.get(**request_args) if self.save_response: self.save_response(self) - def _request_url_post(self, url): - """Execute POST request and assign appropriate request dictionary values""" - self.request["url"] = url - self.request["response"] = self.request["session"].post( - url, - headers=self.request["headers"], - verify=self.request["verify"], - data=self.parameters, - timeout=60, - **self.request["parameters"], - ) + def _request_url_via_proxy(self, url): + if not self.SCRAPINGBEE_API_KEY or not self.SCRAPINGBEE_API_URL: + 
+            raise RuntimeError(
+                "SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment."
+            )
+
+        if self.request["parameters"].get("params"):
+            url = self.url = url + "?" + urlencode(self.request["parameters"]["params"])
+
+        base_proxy_params = {
+            "api_key": self.SCRAPINGBEE_API_KEY,
+            "url": url,
+            "premium_proxy": "true",
+            "country_code": "us",
+            "block_resources": "false",
+        }
+        if self.additional_params:
+            base_proxy_params.update(self.additional_params)
+
+        if self.method == "POST":
+            self.request["response"] = self.request["session"].post(
+                self.SCRAPINGBEE_API_URL,
+                params=base_proxy_params,
+                data=self.parameters,
+            )
+        else:
+            self.request["response"] = self.request["session"].get(
+                self.SCRAPINGBEE_API_URL,
+                params=base_proxy_params,
+            )
+        if self.save_response:
+            self.save_response(self)
diff --git a/juriscraper/opinions/united_states/state/__init__.py b/juriscraper/opinions/united_states/state/__init__.py
index ea3493009..fbb054968 100644
--- a/juriscraper/opinions/united_states/state/__init__.py
+++ b/juriscraper/opinions/united_states/state/__init__.py
@@ -63,6 +63,7 @@
     "la",
     "lactapp_1",
     "lactapp_2",
+    "lactapp_3",
     "lactapp_4",
     "lactapp_5",
     "mass",
diff --git a/juriscraper/opinions/united_states/state/ark.py b/juriscraper/opinions/united_states/state/ark.py
index 64c67283d..9af4d5e74 100644
--- a/juriscraper/opinions/united_states/state/ark.py
+++ b/juriscraper/opinions/united_states/state/ark.py
@@ -5,7 +5,7 @@
 import re
 from datetime import date, datetime, timedelta
 from typing import Any, Optional
-from urllib.parse import urlencode
+from urllib.parse import urlencode, urljoin
 
 from juriscraper.AbstractSite import logger
 from juriscraper.lib.string_utils import normalize_dashes, titlecase
@@ -13,7 +13,8 @@
 
 class Site(OpinionSiteLinear):
-    base_url = "https://opinions.arcourts.gov/ark/en/d/s/index.do"
+    base_url = "https://opinions.arcourts.gov"
+    base_endpoint = urljoin(base_url, "/ark/en/d/s/index.do")
     court_code = "144"
     cite_regex =
re.compile(r"\d{2,4} Ark\. \d+", re.IGNORECASE) first_opinion_date = datetime(1979, 9, 3) @@ -25,6 +26,7 @@ def __init__(self, *args, **kwargs): self.court_id = self.__module__ self.set_url() self.make_backscrape_iterable(kwargs) + self.use_proxy = True def _process_html(self) -> None: """Parse HTML into case dictionaries @@ -44,7 +46,7 @@ def _process_html(self) -> None: per_curiam = False name = item.xpath(".//a/text()")[0] - url = item.xpath(".//a/@href")[1] + url = urljoin(self.base_url, item.xpath(".//a/@href")[1]) if re.search(self.not_a_opinion_regex, name.upper()): logger.info("Skipping %s %s, invalid document", name, url) continue @@ -96,7 +98,7 @@ def set_url( "or": "date", "iframe": "true", } - self.url = f"{self.base_url}?{urlencode(params)}" + self.url = urljoin(self.base_endpoint, f"?{urlencode(params)}") def extract_from_text(self, scraped_text: str) -> dict[str, Any]: """Pass scraped text into function and return data as a dictionary diff --git a/juriscraper/opinions/united_states/state/lactapp_3.py b/juriscraper/opinions/united_states/state/lactapp_3.py new file mode 100644 index 000000000..e067efb21 --- /dev/null +++ b/juriscraper/opinions/united_states/state/lactapp_3.py @@ -0,0 +1,115 @@ +"""Scraper for Louisiana Court of Appeal, Third Circuit +CourtID: lactapp_3 +Court Short Name: La. Ct. App. 3rd Cir. 
+Author: Luis-manzur +History: + 2025-10-06: Created by Luis-manzur +""" + +from datetime import date, datetime +from urllib.parse import urljoin + +from juriscraper.lib.date_utils import unique_year_month +from juriscraper.lib.string_utils import titlecase +from juriscraper.OpinionSiteLinear import OpinionSiteLinear + + +class Site(OpinionSiteLinear): + first_opinion_date = datetime(2003, 9, 3) + days_interval = 1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.court_id = self.__module__ + self.base_url = "https://www.la3circuit.org" + self.url = urljoin(self.base_url, "index.aspx") + current_year = str(datetime.today().year) + current_month = datetime.today().strftime("%B") + self.status = "Published" + self.method = "POST" + self.parameters = { + "__EVENTTARGET": "ctl00$MainContent$btnSearchOpinionsByMonthYear", + "__EVENTARGUMENT": "", + "__VIEWSTATE": "D92S8q+xnTcxFQyxsnxizwqJWnruAi4MVz+8UupGHeg6OPML/GC8kPerqWdgwbOLFd91thSKtLN+e/mPyfY/IME7riCZdoY7QIp0qK1yymEP017OFrxdWr7t2g/8p5hwmWbyontMq74IDIFPqHsTpe1j2pDhqECa1cT7wNh1lXggzCEv+XE66Jj5u/1zVNjWzqzNB5S0tu9yNK2fkMq8X7SyZxhxJJQjim8q30jEm/udHsM4up9SyLJuAycWVubb1W4vTmWfi38+2GSm/w7SIS6JkfqJFUrcWsWqwH2alAn2RyC0XVy0/kHw83ourCU/DJqr6hVvaeGE88VbI6HXLzsOo4oLBz/mULjdjEOroB5zCEHv0VKanq+JGh6Eo3qHnf0sK+izN8lojqvuBYkiXSMkzg2ZwgxIkSVThK2SqbSYYmFzz5xnlw/4WABzW2NHkDrTlJor/hkiWDS8XOxfPs1nFqHyWp/TsdhFtT4Yw3rdcpWCv2DRLoioB93RaT1aevEJ9DUq6TSVQ4yrNfdn05SZ2uKAg873QaB1Cx/U9uG/UjC4JrLyeLjwwgU8/xptFTasXaefCxusiEeyNStAj87aQpHE6mjaPCxwLndWSCmfZ9c9pjmi/siqWx4sNLBa3L1haNFL9UOLGqoPVm/bIR4xV/yhAPvjhOQ9V+GOljf2AlYbQ4/o/rfqJVkxqhK4HrZ9NOQq5o8OmPmgLjn1mk7o8jJffRe4taIJ5GzJ9Z66XyHxpCLUf8eeeIszT7QK6ATYEnZxyqyRoc6SBOn96edncultTn5pWNGwNT8n5FpW/evBtOrWd9nVxluk72GM987QNhQXdsleP0x/Atd7F2pwkfZ/ZEgl2yyxVYwJukdJh6daUVAXFqyPb79Fwj72WAAJkQSWNWtTJIJJ2osUiO4+eiEXUNGiQQueNNCCVaU2F9O8dkGzkfia2hbGgWXekLnKF2FMin5X9E/gzz+gqCCJ2twd+2ba1hOEpl6fMsoUB3FZ5kJhYnhXOQDERwd/B75T09GA29NVK8pJstmz7IIKhdav+f2tSQukkEKGx9wcbeQTCFLznrtoIR9ktblyeJiFw0
kjFf9SE8TrAEOE2Xa8UJRP/2ykqt0UQrcwrO+rDAOPVSVfQIFQwX6Mp1yosMp0PmJmgRmPReDDALulheLoffJ1R4uTsgNfgmKbuFZX5IuBX9eQ3JbaKeLTiXEwZL5590MCnyj5ZwiURxOr08U6onno9/m96LYAJdxGNrtpIJBRyc8jMQfMFroV+nsMUCeJQoTo9P3B6Il6YkLI7ICZ+iMGnwomFBGWvHQr7/pRzzDARX8RsJ8Caejbfb/oqG3SH3On1Ltnnh8dmlaFQWqPDEP65kAXx20TYC3yiBdfv4S9uZDun5lmHYoByJH0GPUiErOSOh8KMnCvdeQ2IA6Q9LkKcODJiKBPkgc5GWmrMO3AXYXKJtFgkNkH91alma4bfG9LOL1Xii5lDIYfb3jRV4BifXwvWswgQY7ntPbHJhCTaHPuG1WfVOuJ/hg8I4yM4EN5eCA6IesOuS+gXEQ9YSJGgRe9aIFLWFfAY5hOwCDVf0H3rvkXHML6X1jgSIg+b4fN/Nex63QMhRm+2gKqGZZlOM4WBm/7TsW3yCpYop/S5aWRdAjaltnRQUYoFrrg8+RHuEGUlHsLZD7Qsj/EB4tuXEvI+ixVshYnQkMryGKh0XcRd92I3rAQhKfsWX17jrb0ADCPNdkRSdLUpu5N1RJ/+v5IjpUkqkOIFie5pRpi3+vAsRfbv3xYxqYlt/CA/67rgp0ta2vNzrLMwnZmFJqpH+n7cJu9wWC8wSBPPV24LTo+yeb8KdPUMId0D00Vn4gAu5fhnsx//QpVCASCjCZ7QEeDv4gXtqxujiQRgiZ8LsG5d0ValkUVlaJocVg6P2id310HnTsEYd8FZ0IZ5+GEjXUDrOwvr1MbO425xk3RCLBG4SC/n3eosF/jG/ZDXILhMKnkG8BxyhooJCkii3gE30wVxm/dStrpC2wm4xsf51GVtVdi1+qaA37HmwWwP4eHwVWBviqcVqs2CzU7P1EgD1glkig6j0TZIJrhFnHPEW6Har0MWiS3ey4126A/BJ3IUeUzX2wYUEHXsKaoEbB1f3jCRxzqGI/SiD0f3j73kmce7E0Q4ys0xbVehdUhphtNK/mQOp/5NPA9hR+JwYyn8eVgt7dsivt+l8tdUzsejcVOUa1Fk05cna89CLL8pZipS0Phi/L+pAD7byg8lsais6OUnAYmsA89sKiYRC4ihQZ4IfckAgltjkLIFUaJU21bMeWIk4PIJnG8rv+hLMABhX5lNMNR+9Vu9DGnjfCWVoGHnAD7GrZRpez42JlhW+RGNZBnOx+vFdyrNASK4KwtqJnw2hbQplW1sJrn3N1gT6rp/Q08NhNECU4BG4CMHetInaBiK1VDrdzgo0pfdUwK24Vh+rgVobWqmunikwMPyMvoUZPRgYGaBkJRMO8V8+YOD4175tpTf5U1pH3q7B+iSueqRWznKiFvd9DgUqzL7iv4H06coG+AnEYjnGyjLBKTccStQ0ajAbZI+yBRGay/YKKJXBVwDyvmR75uq2NNl2GZ6i34IZvXHZhlDwjkpyuh8DDQcrU25ibfZoRI16KJT/T8t3MyJ0oUyep8PsrAHosMSmOw7Z/w7ElTJwA7AZTK5Dg1qIlFkIEWaJVCdrGpCuma2WrfcLa4N+2AHC0SRWbkNQQjg7fJg4k0IvtH+XiK/8UjkxMLT2nuh2usLATho2tbDeB73JZNxN9csSBIUeU5IooRBo9vCxFl", + "__VIEWSTATEGENERATOR": "90059987", + "__EVENTVALIDATION": 
"HhVqD/wjDkVoLffsjh3840YC/bvC7S8ylajc2SA1uLnjOgB/XhpyKxJg9O4r3sywogdCKgJsWDBDgJ8uvquVE7C7Tjnwp5Uoo7UBSPYVvivLodjqyeK7MRXppjwgG6GkjSwhStywWqxazhbrpQfG5/rMC2EkojKtwhVMJZRbP9g9sN8qm/dlp4IrRoJS2xkSDWEJqCfTFZDIE7iM4yuux9TSLAwVRY/oYBJVe3l8vmHKKOtBQds6hlCODMyiUKPcyJf6ZwhNVnUZEhS6qUlcGz/l2/4IMTluBa7f7HEaIA1JXUPM/yu6ICytIXS82ujPrp67lc4JTPOnrE806LoVcVFsUGqDqdbh1EVbw3+vINvdhUnoE3H0WkVnUaR+Hc4zIRNwCvHsnud0VnKA8JKXXktGzVRDYYCYO8gWZduZd+qptF0hHOKEDr6hPTWE8FBz95f/RI/N7VVXzw3Lv1YiKMTsHWwEWdGLU7El86NatDC8PH4t/x6vJd3adAIFjCo29jRvzhIB5sjkc55LCrYcmPOEf3RpWs4KuDExTPaPUnDX70kbo/jmSIw5YDWL7QBVCuI6a6ogo/FVa6Z4i/e0lllKYFYo/Np+BmCT2Gf53dtIDEYS+pPJ/B57Kbt79EQTgs3fDn5ZND/rwVdnRDbWridgY1a9FJvD4AYXejA25hiwY+sca/OekZ4NTpAycHi9iFm48ymJx0Qm8U36q34Z0Bw6iscLxeZcWvvd5lZSr0AGxcARSmv4vtY0BoEZCGdExaZl0sno6ZJSW82ZC+1l+Mba2qygS+zDDwnLu1EweycKsPp0qHbeVvMek/BQzhPdetjfrV5ju0Nf7zCaZIDXqVseKqKfDzMNQP33HL2El744mBZK2et4ojPpSreSFnxLHjHGnr1qxMiuk6NK02TFqFSdJEFXPvq66v8CPzvoECHV4RI1WUeL92POBvHvsGnRYbPT9XLPnaUr7ImEH2B9LYngdLq5D3cvcjSseO6hIb55BUi4lZgO3KvegWbPxluV9ZS9ZOihz714spM1GHnMlGpYEQQDeYL642cnzN1d55mGC61bblbwrwFwI/Hvl2XhfO0Mqwr+qx8yjXNKwBwZOsc31w9EhaMLKXMDDVbWSFm3NthmXaf4c2R5yNmaqVvNJ0OFsv/eDwDqApN67QMSMyZ7i46A0C+AwEWIw5uFScJCudwIRZd4l70Wbtkm0IdAgODC8++1gUTCCWnohY2Q4CeIQZEi3FcC+BdvFuv1iaMuFSo9aJwBmBNEVxK8pJTveUzk5m3kWkdmEz+UZMgrSeWtZeYDlQ1NWUvGT1aIbhQSBK1QqmN/pUTC1Yq8y9klCln7LJBZOvnVG4oyk9fxxw8nb2tClXjHXzYMMSRQ+w4Y8pqBrbxdNs09gxPI93kbz9TlHKNV4VCOOCbzgYRN/enwM30+Faug8srPkIXo1YlAk41ZHyo0CAPGRDgktKOLGVvDJBSPKTfZevMFdgPzdhleSt9t1jlTkb/JEV0hgjG57ulx9iGTbQA2FLGkiAiDkDPeKaOzyUOVls2zYZ4O663bh2NxTRpNC71ARA5GbyWjaRTzWyXANpYIxEr6831OCayAQRTBB+2p/nN3Mr7fUgz9SGgYFXGEwKN1s/xP/47AoQWLq3ICBLHP7W7yQ0MDkYKvInZuqqNlwVOQUVqDbVEJPsBPalcjoXOJ/HlDyZqRet5CqNdLyqb0rwOfkwcrdlqO2LScWsJrHnTIjT6AaChVtiA56xeTqmGIFrLGTi3ielQqtWpQcCEH64kRNmT4ehBzCOlvXPkDWSwlncr4IfOPv4H215DC+ns1dBC9E0+2T7c5SdavI15v53MdNuunbNcf35f4dkfWRIMy/rsCf30prFvRP2KnuA5Ru6de0hxQF1UuK1eaYIvrME6TV5Vcc4reiaWfI2LG+2hreD1oT+IG0WOOMwO8SUtglCaUVyZ9ienHU31uZ7bAjdkqyjT8AIUQrQ2MCNQJCdg
Exs7qIbAeGKE04VCgKyt8EEgEGlzrkERnuW03LtIaVZrG05Y0+VlDY+/5HNrccSQu35o4cuYcDnLlqbE9PRu9G/qB4E53wDqVOJgKR04Y8MQhRNVK2K4dFI+F3AoJe6z4oVRiQXvOchnHqSmc9TAnkUu0zVC1c+0vCS6daRyqrOHi5ZDf2KgT156NgkVEpUlS4R7U7+zudv6lSth9iGxoG7cqlBwek+4KEUMb7On6rZuB5ucLj4TaOkM6CGwloVwDBh5G+Jv7pXc3G1PwNExa4Y6MsaNkWWtRwGPtCZ/pEHbfroQRQiPd7mJp7hsErAxEtgitMkYge24h3fzHPg6lp6ueqt5KPkQncLDV9tgWXJsB9BdGBQwoiCfjKnyQIZSnRoqAUt+lK2bQMSIXvUucxCqZ2yDJvoKlcW2S9kADxYvjxEpb1S3byHhsvwFlgYfZtd7mXiwz1K9KT/AmR0gpDiwgNQXjAv3pBphRhWj+OaMa4pKniz7nIPwZMPCHzXlGMAC/o/xB9etyjiVpNQA/pFhuHiGqwTRdbqKi5owYkVpA0PlZBTeKNDVFRdhZ4+GwUJKT09xNK9ibvcTLOeliNaQ+8mm2wiXep6CZWgw537GiFh1KjWUwUY6tPxzxzuOmMfxsiqelaZslR38aBHs4VYL8TpSplOt7wTUhPoPKNk5tUYl9C7y7lGcChdR879xE7EOgE2WtULL5SubLkyua4eAnMyozaWacXgBHbfepbvB2RHy17vtt78ysRQDYO/bqyDA7BQJwzL8jNtSbHFoozV8RIbv7R5D5JWi5v8Cs5GiW2JGIeD8MwtQcAYoOzNMQzdAq3POyLOes9viyE5rKdHIZUKQ4v1xOZxreOy3fFTgNXJsUpQMKJF4ICaY8DFpWwjejStMejVkuilPC1vyEry1/degBp219BvPWhDrwwXh9YpwwKJhrJcg57fDHJ7/P34DXg/8zLGqdq8TDhqHOEc/TAYXLmqsG75RwN2j4KAhYS+EIdS5dvdHyszGGLm+FzBHduZKToOKuFqtjzTbgawM5NsYC5CLwxpjr8QIQctdUOJsmQ+6Y2LTjhSlDbA1oVMSom80We/3Ae0BDXJSWPAqxTOEw2ptLwtIj1lGM4QnUOMcMEqda0Jfqn/1etsbpaNELZQ==", + "ctl00$MainContent$ddlSearchOpinions1_Year": current_year, + "ctl00$MainContent$ddlSearchOpinions2_Year": current_year, + "ctl00$MainContent$ddlSearchOpinions2_Month": current_month, + } + self.use_proxy = True + self.make_backscrape_iterable(kwargs) + + def _process_html(self) -> None: + """Process the HTML to extract case details. 
+ + :return None + """ + + rows = self.html.xpath( + "(//table[contains(@class, 'table table-striped table-responsive')])[6]//tr" + ) + + for row in rows: + hrefs = row.xpath( + ".//a[contains(text(), 'Download Opinion')]/@href" + ) + if not hrefs: + continue + download_url = urljoin(self.base_url, hrefs[0]) + docket = row.xpath(".//strong[1]/text()")[0] + date_raw = row.xpath( + ".//strong[contains(text(), 'Opinion Date:')]/following-sibling::text()[1]" + )[0].strip() + date = f"{date_raw[:2]}/{date_raw[2:4]}/{date_raw[4:]}" + name = row.xpath( + ".//strong[contains(text(), 'Case Title:')]/following-sibling::text()[1]" + )[0].strip() + lower_court = row.xpath( + ".//strong[contains(text(), 'Lower Court:')]/following-sibling::text()[1]" + )[0].strip() + + self.cases.append( + { + "docket": docket, + "date": date, + "name": titlecase(name), + "lower_court": lower_court, + "url": download_url, + } + ) + + def _download_backwards(self, search_date: date) -> None: + """Download and process HTML for a given target date. + + :param search_date (date): The date for which to download and process opinions. + :return None; sets the target date, downloads the corresponding HTML + and processes the HTML to extract case details. 
+ """ + + self.parameters.update( + { + "ctl00$MainContent$ddlSearchOpinions1_Year": str( + search_date.year + ), + "ctl00$MainContent$ddlSearchOpinions2_Year": str( + search_date.year + ), + "ctl00$MainContent$ddlSearchOpinions2_Month": search_date.strftime( + "%B" + ), + } + ) + self.html = self._download() + self._process_html() + + def make_backscrape_iterable(self, kwargs) -> None: + """Make back scrape iterable + + :param kwargs: the back scraping params + :return: None + """ + super().make_backscrape_iterable(kwargs) + self.back_scrape_iterable = unique_year_month( + self.back_scrape_iterable + ) diff --git a/juriscraper/opinions/united_states/state/minn.py b/juriscraper/opinions/united_states/state/minn.py index 45b6d0644..492b41a5e 100644 --- a/juriscraper/opinions/united_states/state/minn.py +++ b/juriscraper/opinions/united_states/state/minn.py @@ -30,11 +30,12 @@ def __init__(self, *args, **kwargs): self.status = "Unpublished" self.url = "https://mn.gov/law-library/search/" - self.params = self.base_params = { + self.params = { "v:sources": "mn-law-library-opinions", "query": f" (url:/archive/{self.court_query}) ", "sortby": "date", } + self.request["verify"] = False self.request["headers"] = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", @@ -47,16 +48,16 @@ def __init__(self, *args, **kwargs): "Referer": "https://mn.gov/law-library/search/?v%3Asources=mn-law-library-opinions&query=+%28url%3A%2Farchive%2Fsupct%29+&citation=&qt=&sortby=&docket=&case=&v=&p=&start-date=&end-date=", "Connection": "keep-alive", } + self.request["parameters"]["params"] = self.params self.make_backscrape_iterable(kwargs) self.needs_special_headers = True + # self.use_proxy = True def _process_html(self) -> None: """Process the html and extract out the opinions :return: None """ - self.html = self._download({"params": self.params}) - # This warning is useful for backscraping results_number = self.html.xpath( 
"//div[@class='searchresult_number']/text()" @@ -123,7 +124,7 @@ def _process_html(self) -> None: def _download_backwards(self, dates: tuple[date]): logger.info("Backscraping for range %s - %s", *dates) - params = {**self.base_params} + params = {**self.params} params.update( { "start-date": dates[0].strftime("%-m/%-d/%Y"), @@ -132,3 +133,4 @@ def _download_backwards(self, dates: tuple[date]): } ) self.params = params + self.request["parameters"]["params"] = self.params diff --git a/juriscraper/opinions/united_states/state/miss.py b/juriscraper/opinions/united_states/state/miss.py index 5039d3b80..da391a088 100644 --- a/juriscraper/opinions/united_states/state/miss.py +++ b/juriscraper/opinions/united_states/state/miss.py @@ -1,102 +1,55 @@ # Court Contact: bkraft@courts.ms.gov (see https://courts.ms.gov/aoc/aoc.php) +from datetime import date, timedelta +from urllib.parse import urljoin -import datetime - -from juriscraper.lib.string_utils import convert_date_string from juriscraper.OpinionSiteLinear import OpinionSiteLinear -# Landing page: https://courts.ms.gov/appellatecourts/sc/scdecisions.php class Site(OpinionSiteLinear): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.domain = "https://courts.ms.gov" self.court_id = self.__module__ - self.method = "POST" - self.number_of_dates_to_process = 5 - self.pages = {} - self.parameters = {"crt": self.get_court_parameter()} + self.method = "GET" + self.publish_date = "09/11/2025" self.status = "Published" - self.url = f"{self.domain}/appellatecourts/docket/gethddates.php" - - def get_court_parameter(self): - return "SCT" - - """Retrieve dates for which there are case listings. - This site's architecture is no bueno. We have to issue - a POST request to this page to get a array (in the form - of a string) or dates that have cases associated with - them. 
- """ - - def _download(self, request_dict=None): - if request_dict is None: - request_dict = {} - dates_page = super()._download(request_dict) - self.parse_date_pages(dates_page) - - """Keep track of the most recent N date pages. - We dont want to crawl all the way back to 1996, so we only - parse the most recent [self.number_of_dates_to_process] - number of date pages. Since cases are usually published - once a week, this means scraping about the most recent - months worth of cases. - """ - - def parse_date_pages(self, dates_page): - # For testing, each example file should be a specific sub-date page, - # like https://courts.ms.gov/Images/HDList/SCT02-27-2020.html - if self.test_mode_enabled(): - # date below is arbitrary and doesnt matter, it just - # needs to be static for testing to work - self.pages["2020-02-28"] = dates_page - return - for date in self.get_dates_from_date_page(dates_page): - url = "{}/Images/HDList/SCT{}.html".format( - self.domain, - datetime.date.strftime(date, "%m-%d-%Y"), - ) - page = self._get_html_tree_by_url(url) - self.pages[f"{date}"] = page - - """Convert string of dates on page into list of date objects. 
-    """
-
-    def get_dates_from_date_page(self, dates_page):
-        dates = []
-        substrings = dates_page.text_content().split('"')
-        for substring in substrings:
-            try:
-                dates.append(convert_date_string(substring))
-            except ValueError:
-                pass
-        dates.sort(reverse=True)
-        return dates[: self.number_of_dates_to_process]
+        self.url = f"https://courts.ms.gov/appellatecourts/sc/scdecisions.php?date={self.publish_date}"
+        self.use_proxy = True
+        self.additional_params = {
+            "wait_for": "#dispAreaHD > p:nth-child(2) > a"
+        }
+
+    @staticmethod
+    def most_recent_release_date(day: int):
+        """Return the most recent past occurrence of weekday `day` (0=Monday) as MM/DD/YYYY."""
+        delta = (date.today().weekday() - day) % 7
+        return (date.today() - timedelta(days=delta or 7)).strftime("%m/%d/%Y")
 
     def _process_html(self):
-        for date, page in self.pages.items():
-            for anchor in page.xpath(".//a[contains(./@href, '.pdf')]"):
-                parent = anchor.getparent()
-
-                # sometimes the first opinion on the pages is nested
-                # in a

tag for whatever reason. - while parent.getparent().tag != "body": - parent = parent.getparent() - - sections = parent.xpath("./following-sibling::ul") - if not sections: - # the while loop above should mean we never fall in here - continue - - section = sections[0] - self.cases.append( - { - "date": date, - "docket": anchor.text_content().strip(), - "name": section.xpath(".//b")[0] - .text_content() - .strip(), - "summary": section.text_content().strip(), - "url": anchor.xpath("./@href")[0], - } + """Process the html + + :return: None + """ + for link in self.html.xpath( + "//div[@id='dispAreaHD']//a[contains(@href, '.pdf')]" + ): + slug = link.xpath("./@href")[0] + if not slug.startswith("http"): + slug = urljoin( + "https://courts.ms.gov/images/", + slug[3:].replace("\\", "/"), ) + ul_nodes = link.xpath("./following::ul[1]") + if not ul_nodes: + continue + self.cases.append( + { + "date": self.publish_date, + "docket": link.text_content().strip(), + "name": ul_nodes[0] + .xpath(".//b")[0] + .text_content() + .strip(), + "summary": ul_nodes[0].text_content().strip(), + "url": slug, + } + ) diff --git a/juriscraper/opinions/united_states/state/nm.py b/juriscraper/opinions/united_states/state/nm.py index a4babae39..eec991ae8 100644 --- a/juriscraper/opinions/united_states/state/nm.py +++ b/juriscraper/opinions/united_states/state/nm.py @@ -1,7 +1,7 @@ import re from datetime import date, datetime, timedelta from typing import Any, Optional -from urllib.parse import urlencode +from urllib.parse import urlencode, urljoin from juriscraper.AbstractSite import logger from juriscraper.lib.string_utils import titlecase @@ -17,7 +17,7 @@ class Site(OpinionSiteLinear): Additionally, we moved docket number capture to PDF extraction, to limit the number of requests. 
""" - base_url = "https://nmonesource.com/nmos/en/d/s/index.do" + base_url = "https://nmonesource.com/" court_code = "182" first_opinion_date = datetime(1900, 1, 1) days_interval = 15 @@ -27,6 +27,7 @@ def __init__(self, *args, **kwargs): self.court_id = self.__module__ self.set_url() self.make_backscrape_iterable(kwargs) + self.use_proxy = True def _process_html(self) -> None: """Parse HTML into case dictionaries @@ -46,6 +47,8 @@ def _process_html(self) -> None: url = row.xpath( ".//a[contains(@title, 'Download the PDF version')]/@href" )[0] + url = urljoin(self.base_url, url) + name = row.xpath(".//span[@class='title']/a/text()")[0] date_filed = row.xpath(".//span[@class='publicationDate']/text()")[ 0 @@ -106,7 +109,11 @@ def set_url( "or": "date", "iframe": "true", } - self.url = f"{self.base_url}?{urlencode(params)}" + + self.url = ( + urljoin(self.base_url, "nmos/en/d/s/index.do") + + f"?{urlencode(params)}" + ) def _download_backwards(self, dates: tuple[date]) -> None: """Make custom date range request diff --git a/juriscraper/opinions/united_states/state/okla.py b/juriscraper/opinions/united_states/state/okla.py index 8b6d5e333..f93effa81 100644 --- a/juriscraper/opinions/united_states/state/okla.py +++ b/juriscraper/opinions/united_states/state/okla.py @@ -22,6 +22,7 @@ def __init__(self, *args, **kwargs): self.status = "Published" self.expected_content_types = ["text/html"] self.should_have_results = True + self.use_proxy = True def _process_html(self): for row in self.html.xpath(".//li[@class='decision']"): diff --git a/tests/examples/opinions/united_states/lactapp_3_example.compare.json b/tests/examples/opinions/united_states/lactapp_3_example.compare.json new file mode 100644 index 000000000..45b9744eb --- /dev/null +++ b/tests/examples/opinions/united_states/lactapp_3_example.compare.json @@ -0,0 +1,145 @@ +[ + { + "case_dates": "2025-10-01", + "case_names": "State of Louisiana Versus Reginald Ricardo Parnell Vs.", + "download_urls": 
"https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTM0b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "KA -0025-0134", + "lower_courts": "Ninth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "State of Louisiana Versus Marcus Chenier Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNC0wNDU3b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "KA -0024-0457", + "lower_courts": "Twenty-Seventh Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "State of Louisiana Versus Jevon Noah Figaro Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNC0wNDU2b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "KA -0024-0456", + "lower_courts": "Twenty-Seventh Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "State in the Interest of M. P.,l. H., I. H., and S.W. 
Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTM5b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "JAC-0025-0139", + "lower_courts": "Thirty-Fifth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Shannon Driver and Jason Boothe Versus Jack F. Owens, Jr. Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDc5b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0079", + "lower_courts": "Seventh Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Marie Howerton Versus Linda Howerton & David st.louis Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTY3b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0167", + "lower_courts": "Fourteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Marcus Hebert Versus Liberty Mutual Insurance Company Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTMwb3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0130", + "lower_courts": 
"Fifteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Louisiana Energy Gateway, LLC Versus Etc Texas Pipeline, Ltd. Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDMxb3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0031", + "lower_courts": "Thirtieth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Joseph D. Mills Versus Mmg Archer Institute, LLC Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDczb3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0073", + "lower_courts": "Fourteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Christopher L. Clement, Sr. and Sherry H. Clement Versus State of Louisiana Through the Department of Transportation and Development, Parish of Lafayette and City of Lafayette Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNC0wMjU2b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0024-0256", + "lower_courts": "Fifteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Christian D. 
Chesson, Plc Versus Logan Ryan Gandy Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDU3b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0057", + "lower_courts": "Fourteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Alan Perego, as Independent Testamentary of the Estate of Ora Ann Hawkins Perego Versus Kenneth L. Perego II Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTA2bnAucGRm", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0106", + "lower_courts": "Ninth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Acadiana Renal Physicians, Amc Versus Dr. Rapheal Higginbotham, M.D., Dr. Michael Liu, M.D., Dr. Masoud Yazdi, M.D. 
Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTE0bnAucGRm", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0114", + "lower_courts": "Fifteenth Judicial District Court", + "case_name_shorts": "" + } +] \ No newline at end of file diff --git a/tests/examples/opinions/united_states/lactapp_3_example.html b/tests/examples/opinions/united_states/lactapp_3_example.html new file mode 100644 index 000000000..8546f71e7 --- /dev/null +++ b/tests/examples/opinions/united_states/lactapp_3_example.html @@ -0,0 +1,3878 @@ + + + + + + + + Louisiana Court of Appeal, Third Circuit + + + + +

+
+ + + +
+ +
+ + + + + +
+ +
+ + + + + + +
+ + + + +
+ +
+
+ + + + +
+
+

+
E-filed documents no longer require the mailing of an original. +
+ download announcement +

+
+
+ + + +
+

+ image + Welcome to the Louisiana Court of Appeal Third Circuit +

+

The Louisiana Third Circuit Court of Appeal is the largest of five circuit courts of appeal in Louisiana. Its territory consists of 21 parishes in southwest and central Louisiana. The parishes are Acadia, Allen, Avoyelles, Beauregard, Calcasieu, Cameron, Catahoula, Concordia, Evangeline, Grant, Iberia, Jefferson Davis, Lafayette, LaSalle, Natchitoches, Rapides, Sabine, St. Landry, St. Martin, Vernon, and Vermilion.

+

The Third Circuit Court of Appeal was created by Act. No. 561, Section 21(C) of 1958. This act amended Article 7 Section 21 of the Louisiana Constitution in 1921, and established the Third Circuit Court of Appeal with five judges. The Court began operations in Lake Charles on July 1, 1960. The Court was housed in the courthouse annex of the Fourteenth Judicial District Court. Act 10 of 1968 increased the judges from five to six. Act 620 Section 1 of 1977 increased the judges from six to nine and became effective June 1, 1978. Act 801 Section 2(B) of 1987 increased the judges from nine to twelve. The effective date of that act was July 20, 1987. The jurisdiction of the Third Circuit extends over a 21 parish area.

+

Construction of a new courthouse began in January 1990. The Court took occupancy of the building on November 1, 1991. The 3.2 acres on which the Court is located was donated to the State by Mr. A. P. Leonards. The cost of the building was 3.4 million dollars. In 1999 the Court was dedicated & renamed in honor of Judge Albert Tate, the first Chief Judge of the Third Circuit.

+
+
Louisiana Map
+
+
+
+ Records Custodian +

+ Renee R. Simien
+ Clerk of Court
+ Louisiana Third Circuit Court of Appeal
+ 1000 Main Street
+ Lake Charles, LA 70615
+ Email. [email protected] *
+ Phone. (337) 433-9403
+ Fax. (337) 491-2590
+ * Please note [email protected] email is for general inquiries only. Document filings are not permitted.
+

+
+
+
+ + +
+ + +
+
+

+ image + Court Literature Download Court User Guides and Documentation +

+ +
+
+ + + + + +
+ +
+

+ image + Search Records Download Court Current Dockets and Opinions +

+
+ +
+
+
+ + +
+ + +
+ + +
+ +
+
+
+
+ plane +

SignUp To Our Newsletter Subscribe and Get Latest News & Events

+
+
+
+
+ + + +
+ + +
+
+
+
+
+ + +
+ +
+

+ gavel + Language Access Limited English Proficiency Language Access Assistance +

+
+
+

The Louisiana Third Circuit Court of Appeal is committed to providing persons of Limited English Proficiency (LEP) with meaningful access to the court.

+

+ Language Access Coordinator
+ Renee R. Simien
+ Clerk of Court
+ Louisiana Third Circuit Court of Appeal
+ 1000 Main Street
+ Lake Charles, LA 70615
+ Email. [email protected]
+ Phone.
+ Fax. (337) 491-2590
+

+

+ If you need interpretive services or want more information about our language access services, please contact the Clerk’s Office at (337) 433-9403 or [email protected]. + Additional information may be obtained from the Louisiana Supreme Court’s Office of Language Access, available here. + If you have a complaint regarding language access, please click here. + Please follow the links below for assistance in requesting a court interpreter. +

+

+ Language Interpreter Assistance:
+ English
+ Spanish
+ Chinese
+ Vietnamese
+ Arabic
+ French
+ +

+
+ + +
+ + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +