diff --git a/CHANGES.md b/CHANGES.md index 8d0cc30a9..83e603112 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,8 @@ Releases are also tagged in git, if that's helpful. The following changes are not yet released, but are code complete: Features: -- +- add a residential proxy to AbstractSite to help with sites that block known data center IPs #1616 +- add scraper for Louisiana Court of Appeal, Third Circuit #1455 Changes: - diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index 7f5fb12dd..1cfa9a72c 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -3,6 +3,7 @@ import os from datetime import date, datetime, timedelta from typing import Union +from urllib.parse import urlencode import certifi import requests @@ -50,6 +51,7 @@ def __init__(self, cnt=None, **kwargs): super().__init__() # Computed metadata + self.additional_params = None self.hash = None self.html = None self.method = "GET" @@ -72,6 +74,9 @@ def __init__(self, cnt=None, **kwargs): "status": None, "url": None, } + self.use_proxy = False + self.SCRAPINGBEE_API_KEY = os.environ.get("SCRAPINGBEE_API_KEY", None) + self.SCRAPINGBEE_API_URL = os.environ.get("SCRAPINGBEE_API_URL", None) # Attribute to reference a function passed by the caller, # which takes a single argument, the Site object, after @@ -385,10 +390,10 @@ def _download(self, request_dict=None): if self.test_mode_enabled(): self._request_url_mock(self.url) - elif self.method == "GET": - self._request_url_get(self.url) - elif self.method == "POST": - self._request_url_post(self.url) + elif self.use_proxy: + self._request_url_via_proxy(self.url) + else: + self._request_url(self.url) self._post_process_response() return self._return_response_text_object() @@ -434,13 +439,28 @@ def download_content( # Note that we do a GET even if self.method is POST. This is # deliberate. - r = s.get( - download_url, - verify=has_cipher, # WA has a certificate we don't understand - headers=headers, - cookies=self.cookies, - timeout=300, - ) + if self.use_proxy: + params = { + "api_key": self.SCRAPINGBEE_API_KEY, + "url": download_url, + "render_js": "false", + "cookies": self.cookies, + "country_code": "us", + "premium_proxy": "true", + } + + r = s.get( + self.SCRAPINGBEE_API_URL, + params=params, + ) + else: + r = s.get( + download_url, + verify=has_cipher, # WA has a certificate we don't understand + headers=headers, + cookies=self.cookies, + timeout=300, + ) # test for empty files (thank you CA1) if len(r.content) == 0: @@ -494,32 +514,56 @@ def _process_request_parameters(self, parameters=None): del parameters["verify"] self.request["parameters"].update(parameters) - def _request_url_get(self, url): - """Execute GET request and assign appropriate request dictionary - values - """ + def _request_url(self, url): + """Execute GET or POST request and assign appropriate request dictionary values""" self.request["url"] = url - self.request["response"] = self.request["session"].get( - url, + session = self.request["session"] + request_args = dict( + url=url, headers=self.request["headers"], verify=self.request["verify"], timeout=60, **self.request["parameters"], ) + if self.method == "POST": + request_args["data"] = self.parameters + self.request["response"] = session.post(**request_args) + else: + self.request["response"] = session.get(**request_args) if self.save_response: self.save_response(self) - def _request_url_post(self, url): - """Execute POST request and assign appropriate request dictionary values""" - self.request["url"] = url - self.request["response"] = self.request["session"].post( - url, - headers=self.request["headers"], - verify=self.request["verify"], - data=self.parameters, - timeout=60, - **self.request["parameters"], - ) + def _request_url_via_proxy(self, url): + if not self.SCRAPINGBEE_API_KEY or not self.SCRAPINGBEE_API_URL: + raise RuntimeError( + "SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment." + ) + + if self.request["parameters"].get("params"): + self.url += "?" + urlencode(self.request["parameters"]["params"]) + + base_proxy_params = { + "api_key": self.SCRAPINGBEE_API_KEY, + "url": url, + "premium_proxy": "true", + "country_code": "us", + "block_resources": "false", + } + if self.additional_params: + base_proxy_params.update(self.additional_params) + + if self.method == "POST": + self.request["response"] = self.request["session"].post( + self.SCRAPINGBEE_API_URL, + params=base_proxy_params, + data=self.parameters, + ) + else: + self.request["response"] = self.request["session"].get( + self.SCRAPINGBEE_API_URL, + params=base_proxy_params, + ) + if self.save_response: self.save_response(self) diff --git a/juriscraper/opinions/united_states/state/__init__.py b/juriscraper/opinions/united_states/state/__init__.py index ea3493009..fbb054968 100644 --- a/juriscraper/opinions/united_states/state/__init__.py +++ b/juriscraper/opinions/united_states/state/__init__.py @@ -63,6 +63,7 @@ "la", "lactapp_1", "lactapp_2", + "lactapp_3", "lactapp_4", "lactapp_5", "mass", diff --git a/juriscraper/opinions/united_states/state/ark.py b/juriscraper/opinions/united_states/state/ark.py index 64c67283d..9af4d5e74 100644 --- a/juriscraper/opinions/united_states/state/ark.py +++ b/juriscraper/opinions/united_states/state/ark.py @@ -5,7 +5,7 @@ import re from datetime import date, datetime, timedelta from typing import Any, Optional -from urllib.parse import urlencode +from urllib.parse import urlencode, urljoin from juriscraper.AbstractSite import logger from juriscraper.lib.string_utils import normalize_dashes, titlecase @@ -13,7 +13,8 @@ class Site(OpinionSiteLinear): - base_url = "https://opinions.arcourts.gov/ark/en/d/s/index.do" + base_url = "https://opinions.arcourts.gov" + base_endpoint = urljoin(base_url, "/ark/en/d/s/index.do") court_code = "144" cite_regex = re.compile(r"\d{2,4} Ark\. \d+", re.IGNORECASE) first_opinion_date = datetime(1979, 9, 3) @@ -25,6 +26,7 @@ def __init__(self, *args, **kwargs): self.court_id = self.__module__ self.set_url() self.make_backscrape_iterable(kwargs) + self.use_proxy = True def _process_html(self) -> None: """Parse HTML into case dictionaries @@ -44,7 +46,7 @@ def _process_html(self) -> None: per_curiam = False name = item.xpath(".//a/text()")[0] - url = item.xpath(".//a/@href")[1] + url = urljoin(self.base_url, item.xpath(".//a/@href")[1]) if re.search(self.not_a_opinion_regex, name.upper()): logger.info("Skipping %s %s, invalid document", name, url) continue @@ -96,7 +98,7 @@ def set_url( "or": "date", "iframe": "true", } - self.url = f"{self.base_url}?{urlencode(params)}" + self.url = urljoin(self.base_endpoint, f"?{urlencode(params)}") def extract_from_text(self, scraped_text: str) -> dict[str, Any]: """Pass scraped text into function and return data as a dictionary diff --git a/juriscraper/opinions/united_states/state/lactapp_3.py b/juriscraper/opinions/united_states/state/lactapp_3.py new file mode 100644 index 000000000..e067efb21 --- /dev/null +++ b/juriscraper/opinions/united_states/state/lactapp_3.py @@ -0,0 +1,115 @@ +"""Scraper for Louisiana Court of Appeal, Third Circuit +CourtID: lactapp_3 +Court Short Name: La. Ct. App. 3rd Cir. +Author: Luis-manzur +History: + 2025-10-06: Created by Luis-manzur +""" + +from datetime import date, datetime +from urllib.parse import urljoin + +from juriscraper.lib.date_utils import unique_year_month +from juriscraper.lib.string_utils import titlecase +from juriscraper.OpinionSiteLinear import OpinionSiteLinear + + +class Site(OpinionSiteLinear): + first_opinion_date = datetime(2003, 9, 3) + days_interval = 1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.court_id = self.__module__ + self.base_url = "https://www.la3circuit.org" + self.url = urljoin(self.base_url, "index.aspx") + current_year = str(datetime.today().year) + current_month = datetime.today().strftime("%B") + self.status = "Published" + self.method = "POST" + self.parameters = { + "__EVENTTARGET": "ctl00$MainContent$btnSearchOpinionsByMonthYear", + "__EVENTARGUMENT": "", + "__VIEWSTATE": "D92S8q+xnTcxFQyxsnxizwqJWnruAi4MVz+8UupGHeg6OPML/GC8kPerqWdgwbOLFd91thSKtLN+e/mPyfY/IME7riCZdoY7QIp0qK1yymEP017OFrxdWr7t2g/8p5hwmWbyontMq74IDIFPqHsTpe1j2pDhqECa1cT7wNh1lXggzCEv+XE66Jj5u/1zVNjWzqzNB5S0tu9yNK2fkMq8X7SyZxhxJJQjim8q30jEm/udHsM4up9SyLJuAycWVubb1W4vTmWfi38+2GSm/w7SIS6JkfqJFUrcWsWqwH2alAn2RyC0XVy0/kHw83ourCU/DJqr6hVvaeGE88VbI6HXLzsOo4oLBz/mULjdjEOroB5zCEHv0VKanq+JGh6Eo3qHnf0sK+izN8lojqvuBYkiXSMkzg2ZwgxIkSVThK2SqbSYYmFzz5xnlw/4WABzW2NHkDrTlJor/hkiWDS8XOxfPs1nFqHyWp/TsdhFtT4Yw3rdcpWCv2DRLoioB93RaT1aevEJ9DUq6TSVQ4yrNfdn05SZ2uKAg873QaB1Cx/U9uG/UjC4JrLyeLjwwgU8/xptFTasXaefCxusiEeyNStAj87aQpHE6mjaPCxwLndWSCmfZ9c9pjmi/siqWx4sNLBa3L1haNFL9UOLGqoPVm/bIR4xV/yhAPvjhOQ9V+GOljf2AlYbQ4/o/rfqJVkxqhK4HrZ9NOQq5o8OmPmgLjn1mk7o8jJffRe4taIJ5GzJ9Z66XyHxpCLUf8eeeIszT7QK6ATYEnZxyqyRoc6SBOn96edncultTn5pWNGwNT8n5FpW/evBtOrWd9nVxluk72GM987QNhQXdsleP0x/Atd7F2pwkfZ/ZEgl2yyxVYwJukdJh6daUVAXFqyPb79Fwj72WAAJkQSWNWtTJIJJ2osUiO4+eiEXUNGiQQueNNCCVaU2F9O8dkGzkfia2hbGgWXekLnKF2FMin5X9E/gzz+gqCCJ2twd+2ba1hOEpl6fMsoUB3FZ5kJhYnhXOQDERwd/B75T09GA29NVK8pJstmz7IIKhdav+f2tSQukkEKGx9wcbeQTCFLznrtoIR9ktblyeJiFw0kjFf9SE8TrAEOE2Xa8UJRP/2ykqt0UQrcwrO+rDAOPVSVfQIFQwX6Mp1yosMp0PmJmgRmPReDDALulheLoffJ1R4uTsgNfgmKbuFZX5IuBX9eQ3JbaKeLTiXEwZL5590MCnyj5ZwiURxOr08U6onno9/m96LYAJdxGNrtpIJBRyc8jMQfMFroV+nsMUCeJQoTo9P3B6Il6YkLI7ICZ+iMGnwomFBGWvHQr7/pRzzDARX8RsJ8Caejbfb/oqG3SH3On1Ltnnh8dmlaFQWqPDEP65kAXx20TYC3yiBdfv4S9uZDun5lmHYoByJH0GPUiErOSOh8KMnCvdeQ2IA6Q9LkKcODJiKBPkgc5GWmrMO3AXYXKJtFgkNkH91alma4bfG9LOL1Xii5lDIYfb3jRV4BifXwvWswgQY7ntPbHJhCTaHPuG1WfVOuJ/hg8I4yM4EN5eCA6IesOuS+gXEQ9YSJGgRe9aIFLWFfAY5hOwCDVf0H3rvkXHML6X1jgSIg+b4fN/Nex63QMhRm+2gKqGZZlOM4WBm/7TsW3yCpYop/S5aWRdAjaltnRQUYoFrrg8+RHuEGUlHsLZD7Qsj/EB4tuXEvI+ixVshYnQkMryGKh0XcRd92I3rAQhKfsWX17jrb0ADCPNdkRSdLUpu5N1RJ/+v5IjpUkqkOIFie5pRpi3+vAsRfbv3xYxqYlt/CA/67rgp0ta2vNzrLMwnZmFJqpH+n7cJu9wWC8wSBPPV24LTo+yeb8KdPUMId0D00Vn4gAu5fhnsx//QpVCASCjCZ7QEeDv4gXtqxujiQRgiZ8LsG5d0ValkUVlaJocVg6P2id310HnTsEYd8FZ0IZ5+GEjXUDrOwvr1MbO425xk3RCLBG4SC/n3eosF/jG/ZDXILhMKnkG8BxyhooJCkii3gE30wVxm/dStrpC2wm4xsf51GVtVdi1+qaA37HmwWwP4eHwVWBviqcVqs2CzU7P1EgD1glkig6j0TZIJrhFnHPEW6Har0MWiS3ey4126A/BJ3IUeUzX2wYUEHXsKaoEbB1f3jCRxzqGI/SiD0f3j73kmce7E0Q4ys0xbVehdUhphtNK/mQOp/5NPA9hR+JwYyn8eVgt7dsivt+l8tdUzsejcVOUa1Fk05cna89CLL8pZipS0Phi/L+pAD7byg8lsais6OUnAYmsA89sKiYRC4ihQZ4IfckAgltjkLIFUaJU21bMeWIk4PIJnG8rv+hLMABhX5lNMNR+9Vu9DGnjfCWVoGHnAD7GrZRpez42JlhW+RGNZBnOx+vFdyrNASK4KwtqJnw2hbQplW1sJrn3N1gT6rp/Q08NhNECU4BG4CMHetInaBiK1VDrdzgo0pfdUwK24Vh+rgVobWqmunikwMPyMvoUZPRgYGaBkJRMO8V8+YOD4175tpTf5U1pH3q7B+iSueqRWznKiFvd9DgUqzL7iv4H06coG+AnEYjnGyjLBKTccStQ0ajAbZI+yBRGay/YKKJXBVwDyvmR75uq2NNl2GZ6i34IZvXHZhlDwjkpyuh8DDQcrU25ibfZoRI16KJT/T8t3MyJ0oUyep8PsrAHosMSmOw7Z/w7ElTJwA7AZTK5Dg1qIlFkIEWaJVCdrGpCuma2WrfcLa4N+2AHC0SRWbkNQQjg7fJg4k0IvtH+XiK/8UjkxMLT2nuh2usLATho2tbDeB73JZNxN9csSBIUeU5IooRBo9vCxFl", + "__VIEWSTATEGENERATOR": "90059987", + "__EVENTVALIDATION": "HhVqD/wjDkVoLffsjh3840YC/bvC7S8ylajc2SA1uLnjOgB/XhpyKxJg9O4r3sywogdCKgJsWDBDgJ8uvquVE7C7Tjnwp5Uoo7UBSPYVvivLodjqyeK7MRXppjwgG6GkjSwhStywWqxazhbrpQfG5/rMC2EkojKtwhVMJZRbP9g9sN8qm/dlp4IrRoJS2xkSDWEJqCfTFZDIE7iM4yuux9TSLAwVRY/oYBJVe3l8vmHKKOtBQds6hlCODMyiUKPcyJf6ZwhNVnUZEhS6qUlcGz/l2/4IMTluBa7f7HEaIA1JXUPM/yu6ICytIXS82ujPrp67lc4JTPOnrE806LoVcVFsUGqDqdbh1EVbw3+vINvdhUnoE3H0WkVnUaR+Hc4zIRNwCvHsnud0VnKA8JKXXktGzVRDYYCYO8gWZduZd+qptF0hHOKEDr6hPTWE8FBz95f/RI/N7VVXzw3Lv1YiKMTsHWwEWdGLU7El86NatDC8PH4t/x6vJd3adAIFjCo29jRvzhIB5sjkc55LCrYcmPOEf3RpWs4KuDExTPaPUnDX70kbo/jmSIw5YDWL7QBVCuI6a6ogo/FVa6Z4i/e0lllKYFYo/Np+BmCT2Gf53dtIDEYS+pPJ/B57Kbt79EQTgs3fDn5ZND/rwVdnRDbWridgY1a9FJvD4AYXejA25hiwY+sca/OekZ4NTpAycHi9iFm48ymJx0Qm8U36q34Z0Bw6iscLxeZcWvvd5lZSr0AGxcARSmv4vtY0BoEZCGdExaZl0sno6ZJSW82ZC+1l+Mba2qygS+zDDwnLu1EweycKsPp0qHbeVvMek/BQzhPdetjfrV5ju0Nf7zCaZIDXqVseKqKfDzMNQP33HL2El744mBZK2et4ojPpSreSFnxLHjHGnr1qxMiuk6NK02TFqFSdJEFXPvq66v8CPzvoECHV4RI1WUeL92POBvHvsGnRYbPT9XLPnaUr7ImEH2B9LYngdLq5D3cvcjSseO6hIb55BUi4lZgO3KvegWbPxluV9ZS9ZOihz714spM1GHnMlGpYEQQDeYL642cnzN1d55mGC61bblbwrwFwI/Hvl2XhfO0Mqwr+qx8yjXNKwBwZOsc31w9EhaMLKXMDDVbWSFm3NthmXaf4c2R5yNmaqVvNJ0OFsv/eDwDqApN67QMSMyZ7i46A0C+AwEWIw5uFScJCudwIRZd4l70Wbtkm0IdAgODC8++1gUTCCWnohY2Q4CeIQZEi3FcC+BdvFuv1iaMuFSo9aJwBmBNEVxK8pJTveUzk5m3kWkdmEz+UZMgrSeWtZeYDlQ1NWUvGT1aIbhQSBK1QqmN/pUTC1Yq8y9klCln7LJBZOvnVG4oyk9fxxw8nb2tClXjHXzYMMSRQ+w4Y8pqBrbxdNs09gxPI93kbz9TlHKNV4VCOOCbzgYRN/enwM30+Faug8srPkIXo1YlAk41ZHyo0CAPGRDgktKOLGVvDJBSPKTfZevMFdgPzdhleSt9t1jlTkb/JEV0hgjG57ulx9iGTbQA2FLGkiAiDkDPeKaOzyUOVls2zYZ4O663bh2NxTRpNC71ARA5GbyWjaRTzWyXANpYIxEr6831OCayAQRTBB+2p/nN3Mr7fUgz9SGgYFXGEwKN1s/xP/47AoQWLq3ICBLHP7W7yQ0MDkYKvInZuqqNlwVOQUVqDbVEJPsBPalcjoXOJ/HlDyZqRet5CqNdLyqb0rwOfkwcrdlqO2LScWsJrHnTIjT6AaChVtiA56xeTqmGIFrLGTi3ielQqtWpQcCEH64kRNmT4ehBzCOlvXPkDWSwlncr4IfOPv4H215DC+ns1dBC9E0+2T7c5SdavI15v53MdNuunbNcf35f4dkfWRIMy/rsCf30prFvRP2KnuA5Ru6de0hxQF1UuK1eaYIvrME6TV5Vcc4reiaWfI2LG+2hreD1oT+IG0WOOMwO8SUtglCaUVyZ9ienHU31uZ7bAjdkqyjT8AIUQrQ2MCNQJCdgExs7qIbAeGKE04VCgKyt8EEgEGlzrkERnuW03LtIaVZrG05Y0+VlDY+/5HNrccSQu35o4cuYcDnLlqbE9PRu9G/qB4E53wDqVOJgKR04Y8MQhRNVK2K4dFI+F3AoJe6z4oVRiQXvOchnHqSmc9TAnkUu0zVC1c+0vCS6daRyqrOHi5ZDf2KgT156NgkVEpUlS4R7U7+zudv6lSth9iGxoG7cqlBwek+4KEUMb7On6rZuB5ucLj4TaOkM6CGwloVwDBh5G+Jv7pXc3G1PwNExa4Y6MsaNkWWtRwGPtCZ/pEHbfroQRQiPd7mJp7hsErAxEtgitMkYge24h3fzHPg6lp6ueqt5KPkQncLDV9tgWXJsB9BdGBQwoiCfjKnyQIZSnRoqAUt+lK2bQMSIXvUucxCqZ2yDJvoKlcW2S9kADxYvjxEpb1S3byHhsvwFlgYfZtd7mXiwz1K9KT/AmR0gpDiwgNQXjAv3pBphRhWj+OaMa4pKniz7nIPwZMPCHzXlGMAC/o/xB9etyjiVpNQA/pFhuHiGqwTRdbqKi5owYkVpA0PlZBTeKNDVFRdhZ4+GwUJKT09xNK9ibvcTLOeliNaQ+8mm2wiXep6CZWgw537GiFh1KjWUwUY6tPxzxzuOmMfxsiqelaZslR38aBHs4VYL8TpSplOt7wTUhPoPKNk5tUYl9C7y7lGcChdR879xE7EOgE2WtULL5SubLkyua4eAnMyozaWacXgBHbfepbvB2RHy17vtt78ysRQDYO/bqyDA7BQJwzL8jNtSbHFoozV8RIbv7R5D5JWi5v8Cs5GiW2JGIeD8MwtQcAYoOzNMQzdAq3POyLOes9viyE5rKdHIZUKQ4v1xOZxreOy3fFTgNXJsUpQMKJF4ICaY8DFpWwjejStMejVkuilPC1vyEry1/degBp219BvPWhDrwwXh9YpwwKJhrJcg57fDHJ7/P34DXg/8zLGqdq8TDhqHOEc/TAYXLmqsG75RwN2j4KAhYS+EIdS5dvdHyszGGLm+FzBHduZKToOKuFqtjzTbgawM5NsYC5CLwxpjr8QIQctdUOJsmQ+6Y2LTjhSlDbA1oVMSom80We/3Ae0BDXJSWPAqxTOEw2ptLwtIj1lGM4QnUOMcMEqda0Jfqn/1etsbpaNELZQ==", + "ctl00$MainContent$ddlSearchOpinions1_Year": current_year, + "ctl00$MainContent$ddlSearchOpinions2_Year": current_year, + "ctl00$MainContent$ddlSearchOpinions2_Month": current_month, + } + self.use_proxy = True + self.make_backscrape_iterable(kwargs) + + def _process_html(self) -> None: + """Process the HTML to extract case details. + + :return None + """ + + rows = self.html.xpath( + "(//table[contains(@class, 'table table-striped table-responsive')])[6]//tr" + ) + + for row in rows: + hrefs = row.xpath( + ".//a[contains(text(), 'Download Opinion')]/@href" + ) + if not hrefs: + continue + download_url = urljoin(self.base_url, hrefs[0]) + docket = row.xpath(".//strong[1]/text()")[0] + date_raw = row.xpath( + ".//strong[contains(text(), 'Opinion Date:')]/following-sibling::text()[1]" + )[0].strip() + date = f"{date_raw[:2]}/{date_raw[2:4]}/{date_raw[4:]}" + name = row.xpath( + ".//strong[contains(text(), 'Case Title:')]/following-sibling::text()[1]" + )[0].strip() + lower_court = row.xpath( + ".//strong[contains(text(), 'Lower Court:')]/following-sibling::text()[1]" + )[0].strip() + + self.cases.append( + { + "docket": docket, + "date": date, + "name": titlecase(name), + "lower_court": lower_court, + "url": download_url, + } + ) + + def _download_backwards(self, search_date: date) -> None: + """Download and process HTML for a given target date. + + :param search_date (date): The date for which to download and process opinions. + :return None; sets the target date, downloads the corresponding HTML + and processes the HTML to extract case details. + """ + + self.parameters.update( + { + "ctl00$MainContent$ddlSearchOpinions1_Year": str( + search_date.year + ), + "ctl00$MainContent$ddlSearchOpinions2_Year": str( + search_date.year + ), + "ctl00$MainContent$ddlSearchOpinions2_Month": search_date.strftime( + "%B" + ), + } + ) + self.html = self._download() + self._process_html() + + def make_backscrape_iterable(self, kwargs) -> None: + """Make back scrape iterable + + :param kwargs: the back scraping params + :return: None + """ + super().make_backscrape_iterable(kwargs) + self.back_scrape_iterable = unique_year_month( + self.back_scrape_iterable + ) diff --git a/juriscraper/opinions/united_states/state/minn.py b/juriscraper/opinions/united_states/state/minn.py index 45b6d0644..492b41a5e 100644 --- a/juriscraper/opinions/united_states/state/minn.py +++ b/juriscraper/opinions/united_states/state/minn.py @@ -30,11 +30,12 @@ def __init__(self, *args, **kwargs): self.status = "Unpublished" self.url = "https://mn.gov/law-library/search/" - self.params = self.base_params = { + self.params = { "v:sources": "mn-law-library-opinions", "query": f" (url:/archive/{self.court_query}) ", "sortby": "date", } + self.request["verify"] = False self.request["headers"] = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", @@ -47,16 +48,16 @@ def __init__(self, *args, **kwargs): "Referer": "https://mn.gov/law-library/search/?v%3Asources=mn-law-library-opinions&query=+%28url%3A%2Farchive%2Fsupct%29+&citation=&qt=&sortby=&docket=&case=&v=&p=&start-date=&end-date=", "Connection": "keep-alive", } + self.request["parameters"]["params"] = self.params self.make_backscrape_iterable(kwargs) self.needs_special_headers = True + # self.use_proxy = True def _process_html(self) -> None: """Process the html and extract out the opinions :return: None """ - self.html = self._download({"params": self.params}) - # This warning is useful for backscraping results_number = self.html.xpath( "//div[@class='searchresult_number']/text()" @@ -123,7 +124,7 @@ def _process_html(self) -> None: def _download_backwards(self, dates: tuple[date]): logger.info("Backscraping for range %s - %s", *dates) - params = {**self.base_params} + params = {**self.params} params.update( { "start-date": dates[0].strftime("%-m/%-d/%Y"), @@ -132,3 +133,4 @@ def _download_backwards(self, dates: tuple[date]): } ) self.params = params + self.request["parameters"]["params"] = self.params diff --git a/juriscraper/opinions/united_states/state/miss.py b/juriscraper/opinions/united_states/state/miss.py index 5039d3b80..da391a088 100644 --- a/juriscraper/opinions/united_states/state/miss.py +++ b/juriscraper/opinions/united_states/state/miss.py @@ -1,102 +1,55 @@ # Court Contact: bkraft@courts.ms.gov (see https://courts.ms.gov/aoc/aoc.php) +from datetime import date, timedelta +from urllib.parse import urljoin -import datetime - -from juriscraper.lib.string_utils import convert_date_string from juriscraper.OpinionSiteLinear import OpinionSiteLinear -# Landing page: https://courts.ms.gov/appellatecourts/sc/scdecisions.php class Site(OpinionSiteLinear): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.domain = "https://courts.ms.gov" self.court_id = self.__module__ - self.method = "POST" - self.number_of_dates_to_process = 5 - self.pages = {} - self.parameters = {"crt": self.get_court_parameter()} + self.method = "GET" + self.publish_date = "09/11/2025" self.status = "Published" - self.url = f"{self.domain}/appellatecourts/docket/gethddates.php" - - def get_court_parameter(self): - return "SCT" - - """Retrieve dates for which there are case listings. - This site's architecture is no bueno. We have to issue - a POST request to this page to get a array (in the form - of a string) or dates that have cases associated with - them. - """ - - def _download(self, request_dict=None): - if request_dict is None: - request_dict = {} - dates_page = super()._download(request_dict) - self.parse_date_pages(dates_page) - - """Keep track of the most recent N date pages. - We dont want to crawl all the way back to 1996, so we only - parse the most recent [self.number_of_dates_to_process] - number of date pages. Since cases are usually published - once a week, this means scraping about the most recent - months worth of cases. - """ - - def parse_date_pages(self, dates_page): - # For testing, each example file should be a specific sub-date page, - # like https://courts.ms.gov/Images/HDList/SCT02-27-2020.html - if self.test_mode_enabled(): - # date below is arbitrary and doesnt matter, it just - # needs to be static for testing to work - self.pages["2020-02-28"] = dates_page - return - for date in self.get_dates_from_date_page(dates_page): - url = "{}/Images/HDList/SCT{}.html".format( - self.domain, - datetime.date.strftime(date, "%m-%d-%Y"), - ) - page = self._get_html_tree_by_url(url) - self.pages[f"{date}"] = page - - """Convert string of dates on page into list of date objects. - """ - - def get_dates_from_date_page(self, dates_page): - dates = [] - substrings = dates_page.text_content().split('"') - for substring in substrings: - try: - dates.append(convert_date_string(substring)) - except ValueError: - pass - dates.sort(reverse=True) - return dates[: self.number_of_dates_to_process] + self.url = f"https://courts.ms.gov/appellatecourts/sc/scdecisions.php?date={self.publish_date}" + self.use_proxy = True + self.additional_params = { + "wait_for": "#dispAreaHD > p:nth-child(2) > a" + } + + @staticmethod + def most_recent_release_date(day: int): + """""" + delta = (date.today().weekday() - day) % 7 + return (date.today() - timedelta(days=delta or 7)).strftime("%m/%d/%Y") def _process_html(self): - for date, page in self.pages.items(): - for anchor in page.xpath(".//a[contains(./@href, '.pdf')]"): - parent = anchor.getparent() - - # sometimes the first opinion on the pages is nested - # in a
tag for whatever reason. - while parent.getparent().tag != "body": - parent = parent.getparent() - - sections = parent.xpath("./following-sibling::ul") - if not sections: - # the while loop above should mean we never fall in here - continue - - section = sections[0] - self.cases.append( - { - "date": date, - "docket": anchor.text_content().strip(), - "name": section.xpath(".//b")[0] - .text_content() - .strip(), - "summary": section.text_content().strip(), - "url": anchor.xpath("./@href")[0], - } + """Process the html + + :return: None + """ + for link in self.html.xpath( + "//div[@id='dispAreaHD']//a[contains(@href, '.pdf')]" + ): + slug = link.xpath("./@href")[0] + if not slug.startswith("http"): + slug = urljoin( + "https://courts.ms.gov/images/", + slug[3:].replace("\\", "/"), ) + ul_nodes = link.xpath("./following::ul[1]") + if not ul_nodes: + continue + self.cases.append( + { + "date": self.publish_date, + "docket": link.text_content().strip(), + "name": ul_nodes[0] + .xpath(".//b")[0] + .text_content() + .strip(), + "summary": ul_nodes[0].text_content().strip(), + "url": slug, + } + ) diff --git a/juriscraper/opinions/united_states/state/nm.py b/juriscraper/opinions/united_states/state/nm.py index a4babae39..eec991ae8 100644 --- a/juriscraper/opinions/united_states/state/nm.py +++ b/juriscraper/opinions/united_states/state/nm.py @@ -1,7 +1,7 @@ import re from datetime import date, datetime, timedelta from typing import Any, Optional -from urllib.parse import urlencode +from urllib.parse import urlencode, urljoin from juriscraper.AbstractSite import logger from juriscraper.lib.string_utils import titlecase @@ -17,7 +17,7 @@ class Site(OpinionSiteLinear): Additionally, we moved docket number capture to PDF extraction, to limit the number of requests. """ - base_url = "https://nmonesource.com/nmos/en/d/s/index.do" + base_url = "https://nmonesource.com/" court_code = "182" first_opinion_date = datetime(1900, 1, 1) days_interval = 15 @@ -27,6 +27,7 @@ def __init__(self, *args, **kwargs): self.court_id = self.__module__ self.set_url() self.make_backscrape_iterable(kwargs) + self.use_proxy = True def _process_html(self) -> None: """Parse HTML into case dictionaries @@ -46,6 +47,8 @@ def _process_html(self) -> None: url = row.xpath( ".//a[contains(@title, 'Download the PDF version')]/@href" )[0] + url = urljoin(self.base_url, url) + name = row.xpath(".//span[@class='title']/a/text()")[0] date_filed = row.xpath(".//span[@class='publicationDate']/text()")[ 0 @@ -106,7 +109,11 @@ def set_url( "or": "date", "iframe": "true", } - self.url = f"{self.base_url}?{urlencode(params)}" + + self.url = ( + urljoin(self.base_url, "nmos/en/d/s/index.do") + + f"?{urlencode(params)}" + ) def _download_backwards(self, dates: tuple[date]) -> None: """Make custom date range request diff --git a/juriscraper/opinions/united_states/state/okla.py b/juriscraper/opinions/united_states/state/okla.py index 8b6d5e333..f93effa81 100644 --- a/juriscraper/opinions/united_states/state/okla.py +++ b/juriscraper/opinions/united_states/state/okla.py @@ -22,6 +22,7 @@ def __init__(self, *args, **kwargs): self.status = "Published" self.expected_content_types = ["text/html"] self.should_have_results = True + self.use_proxy = True def _process_html(self): for row in self.html.xpath(".//li[@class='decision']"): diff --git a/tests/examples/opinions/united_states/lactapp_3_example.compare.json b/tests/examples/opinions/united_states/lactapp_3_example.compare.json new file mode 100644 index 000000000..45b9744eb --- /dev/null +++ b/tests/examples/opinions/united_states/lactapp_3_example.compare.json @@ -0,0 +1,145 @@ +[ + { + "case_dates": "2025-10-01", + "case_names": "State of Louisiana Versus Reginald Ricardo Parnell Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTM0b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "KA -0025-0134", + "lower_courts": "Ninth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "State of Louisiana Versus Marcus Chenier Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNC0wNDU3b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "KA -0024-0457", + "lower_courts": "Twenty-Seventh Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "State of Louisiana Versus Jevon Noah Figaro Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNC0wNDU2b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "KA -0024-0456", + "lower_courts": "Twenty-Seventh Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "State in the Interest of M. P.,l. H., I. H., and S.W. Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTM5b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "JAC-0025-0139", + "lower_courts": "Thirty-Fifth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Shannon Driver and Jason Boothe Versus Jack F. Owens, Jr. Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDc5b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0079", + "lower_courts": "Seventh Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Marie Howerton Versus Linda Howerton & David st.louis Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTY3b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0167", + "lower_courts": "Fourteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Marcus Hebert Versus Liberty Mutual Insurance Company Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTMwb3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0130", + "lower_courts": "Fifteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Louisiana Energy Gateway, LLC Versus Etc Texas Pipeline, Ltd. Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDMxb3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0031", + "lower_courts": "Thirtieth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Joseph D. Mills Versus Mmg Archer Institute, LLC Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDczb3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0073", + "lower_courts": "Fourteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Christopher L. Clement, Sr. and Sherry H. Clement Versus State of Louisiana Through the Department of Transportation and Development, Parish of Lafayette and City of Lafayette Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNC0wMjU2b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0024-0256", + "lower_courts": "Fifteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Christian D. Chesson, Plc Versus Logan Ryan Gandy Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMDU3b3BpLnBkZg==", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0057", + "lower_courts": "Fourteenth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Alan Perego, as Independent Testamentary of the Estate of Ora Ann Hawkins Perego Versus Kenneth L. Perego II Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTA2bnAucGRm", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0106", + "lower_courts": "Ninth Judicial District Court", + "case_name_shorts": "" + }, + { + "case_dates": "2025-10-01", + "case_names": "Acadiana Renal Physicians, Amc Versus Dr. Rapheal Higginbotham, M.D., Dr. Michael Liu, M.D., Dr. Masoud Yazdi, M.D. Vs.", + "download_urls": "https://www.la3circuit.org/tests/examples/opinions/united_states/transmit.aspx?id=XFxBUkNISVZFU1xBcHBlYWxzXENhc2VBcmNoaXZlc1xPcGluaW9uc1wyMDI1XDEwXDEwMDEyNVwyNS0wMTE0bnAucGRm", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "CA -0025-0114", + "lower_courts": "Fifteenth Judicial District Court", + "case_name_shorts": "" + } +] \ No newline at end of file diff --git a/tests/examples/opinions/united_states/lactapp_3_example.html b/tests/examples/opinions/united_states/lactapp_3_example.html new file mode 100644 index 000000000..8546f71e7 --- /dev/null +++ b/tests/examples/opinions/united_states/lactapp_3_example.html @@ -0,0 +1,3878 @@ + + + + + + +