Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Releases are also tagged in git, if that's helpful.
The following changes are not yet released, but are code complete:

Features:
-
- add a residential proxy to AbstractSite to help with sites that block known data center IPs #1616

Changes:
-
Expand Down
100 changes: 72 additions & 28 deletions juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from datetime import date, datetime, timedelta
from typing import Union
from urllib.parse import urlencode

import certifi
import requests
Expand Down Expand Up @@ -50,6 +51,7 @@ def __init__(self, cnt=None, **kwargs):
super().__init__()

# Computed metadata
self.additional_params = None
self.hash = None
self.html = None
self.method = "GET"
Expand All @@ -72,6 +74,9 @@ def __init__(self, cnt=None, **kwargs):
"status": None,
"url": None,
}
self.use_proxy = False
self.SCRAPINGBEE_API_KEY = os.environ.get("SCRAPINGBEE_API_KEY", None)
self.SCRAPINGBEE_API_URL = os.environ.get("SCRAPINGBEE_API_URL", None)

# Attribute to reference a function passed by the caller,
# which takes a single argument, the Site object, after
Expand Down Expand Up @@ -385,10 +390,10 @@ def _download(self, request_dict=None):

if self.test_mode_enabled():
self._request_url_mock(self.url)
elif self.method == "GET":
self._request_url_get(self.url)
elif self.method == "POST":
self._request_url_post(self.url)
elif self.use_proxy:
self._request_url_via_proxy(self.url)
else:
self._request_url(self.url)

self._post_process_response()
return self._return_response_text_object()
Expand Down Expand Up @@ -434,13 +439,28 @@ def download_content(

# Note that we do a GET even if self.method is POST. This is
# deliberate.
r = s.get(
download_url,
verify=has_cipher, # WA has a certificate we don't understand
headers=headers,
cookies=self.cookies,
timeout=300,
)
if self.use_proxy:
params = {
"api_key": self.SCRAPINGBEE_API_KEY,
"url": download_url,
"render_js": "false",
"cookies": self.cookies,
"country_code": "us",
"premium_proxy": "true",
}

r = s.get(
self.SCRAPINGBEE_API_URL,
params=params,
)
else:
r = s.get(
download_url,
verify=has_cipher, # WA has a certificate we don't understand
headers=headers,
cookies=self.cookies,
timeout=300,
)

# test for empty files (thank you CA1)
if len(r.content) == 0:
Expand Down Expand Up @@ -494,32 +514,56 @@ def _process_request_parameters(self, parameters=None):
del parameters["verify"]
self.request["parameters"].update(parameters)

def _request_url_get(self, url):
"""Execute GET request and assign appropriate request dictionary
values
"""
def _request_url(self, url):
"""Execute GET or POST request and assign appropriate request dictionary values"""
self.request["url"] = url
self.request["response"] = self.request["session"].get(
url,
session = self.request["session"]
request_args = dict(
url=url,
headers=self.request["headers"],
verify=self.request["verify"],
timeout=60,
**self.request["parameters"],
)
if self.method == "POST":
request_args["data"] = self.parameters
self.request["response"] = session.post(**request_args)
else:
self.request["response"] = session.get(**request_args)
if self.save_response:
self.save_response(self)

def _request_url_post(self, url):
"""Execute POST request and assign appropriate request dictionary values"""
self.request["url"] = url
self.request["response"] = self.request["session"].post(
url,
headers=self.request["headers"],
verify=self.request["verify"],
data=self.parameters,
timeout=60,
**self.request["parameters"],
)
def _request_url_via_proxy(self, url):
if not self.SCRAPINGBEE_API_KEY or not self.SCRAPINGBEE_API_URL:
raise RuntimeError(
"SCRAPINGBEE_API_KEY and SCRAPINGBEE_API_URL not set in environment."
)

if self.request["parameters"].get("params"):
self.url += "?" + urlencode(self.request["parameters"]["params"])

base_proxy_params = {
"api_key": self.SCRAPINGBEE_API_KEY,
"url": url,
"premium_proxy": "true",
"country_code": "us",
"block_resources": "false",
}
if self.additional_params:
base_proxy_params.update(self.additional_params)

if self.method == "POST":
self.request["response"] = self.request["session"].post(
self.SCRAPINGBEE_API_URL,
params=base_proxy_params,
data=self.parameters,
)
else:
self.request["response"] = self.request["session"].get(
self.SCRAPINGBEE_API_URL,
params=base_proxy_params,
)

if self.save_response:
self.save_response(self)

Expand Down
10 changes: 6 additions & 4 deletions juriscraper/opinions/united_states/state/ark.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,16 @@
import re
from datetime import date, datetime, timedelta
from typing import Any, Optional
from urllib.parse import urlencode
from urllib.parse import urlencode, urljoin

from juriscraper.AbstractSite import logger
from juriscraper.lib.string_utils import normalize_dashes, titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
base_url = "https://opinions.arcourts.gov/ark/en/d/s/index.do"
base_url = "https://opinions.arcourts.gov"
base_endpoint = urljoin(base_url, "/ark/en/d/s/index.do")
court_code = "144"
cite_regex = re.compile(r"\d{2,4} Ark\. \d+", re.IGNORECASE)
first_opinion_date = datetime(1979, 9, 3)
Expand All @@ -25,6 +26,7 @@ def __init__(self, *args, **kwargs):
self.court_id = self.__module__
self.set_url()
self.make_backscrape_iterable(kwargs)
self.use_proxy = True

def _process_html(self) -> None:
"""Parse HTML into case dictionaries
Expand All @@ -44,7 +46,7 @@ def _process_html(self) -> None:
per_curiam = False

name = item.xpath(".//a/text()")[0]
url = item.xpath(".//a/@href")[1]
url = urljoin(self.base_url, item.xpath(".//a/@href")[1])
if re.search(self.not_a_opinion_regex, name.upper()):
logger.info("Skipping %s %s, invalid document", name, url)
continue
Expand Down Expand Up @@ -96,7 +98,7 @@ def set_url(
"or": "date",
"iframe": "true",
}
self.url = f"{self.base_url}?{urlencode(params)}"
self.url = urljoin(self.base_endpoint, f"?{urlencode(params)}")

def extract_from_text(self, scraped_text: str) -> dict[str, Any]:
"""Pass scraped text into function and return data as a dictionary
Expand Down
10 changes: 6 additions & 4 deletions juriscraper/opinions/united_states/state/minn.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ def __init__(self, *args, **kwargs):
self.status = "Unpublished"

self.url = "https://mn.gov/law-library/search/"
self.params = self.base_params = {
self.params = {
"v:sources": "mn-law-library-opinions",
"query": f" (url:/archive/{self.court_query}) ",
"sortby": "date",
}

self.request["verify"] = False
self.request["headers"] = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
Expand All @@ -47,16 +48,16 @@ def __init__(self, *args, **kwargs):
"Referer": "https://mn.gov/law-library/search/?v%3Asources=mn-law-library-opinions&query=+%28url%3A%2Farchive%2Fsupct%29+&citation=&qt=&sortby=&docket=&case=&v=&p=&start-date=&end-date=",
"Connection": "keep-alive",
}
self.request["parameters"]["params"] = self.params
self.make_backscrape_iterable(kwargs)
self.needs_special_headers = True
# self.use_proxy = True

def _process_html(self) -> None:
"""Process the html and extract out the opinions

:return: None
"""
self.html = self._download({"params": self.params})

# This warning is useful for backscraping
results_number = self.html.xpath(
"//div[@class='searchresult_number']/text()"
Expand Down Expand Up @@ -123,7 +124,7 @@ def _process_html(self) -> None:

def _download_backwards(self, dates: tuple[date]):
logger.info("Backscraping for range %s - %s", *dates)
params = {**self.base_params}
params = {**self.params}
params.update(
{
"start-date": dates[0].strftime("%-m/%-d/%Y"),
Expand All @@ -132,3 +133,4 @@ def _download_backwards(self, dates: tuple[date]):
}
)
self.params = params
self.request["parameters"]["params"] = self.params
131 changes: 42 additions & 89 deletions juriscraper/opinions/united_states/state/miss.py
Original file line number Diff line number Diff line change
@@ -1,102 +1,55 @@
# Court Contact: bkraft@courts.ms.gov (see https://courts.ms.gov/aoc/aoc.php)
from datetime import date, timedelta
from urllib.parse import urljoin

import datetime

from juriscraper.lib.string_utils import convert_date_string
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


# Landing page: https://courts.ms.gov/appellatecourts/sc/scdecisions.php
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.domain = "https://courts.ms.gov"
self.court_id = self.__module__
self.method = "POST"
self.number_of_dates_to_process = 5
self.pages = {}
self.parameters = {"crt": self.get_court_parameter()}
self.method = "GET"
self.publish_date = "09/11/2025"
self.status = "Published"
self.url = f"{self.domain}/appellatecourts/docket/gethddates.php"

def get_court_parameter(self):
return "SCT"

"""Retrieve dates for which there are case listings.
This site's architecture is no bueno. We have to issue
a POST request to this page to get an array (in the form
of a string) of dates that have cases associated with
them.
"""

def _download(self, request_dict=None):
if request_dict is None:
request_dict = {}
dates_page = super()._download(request_dict)
self.parse_date_pages(dates_page)

"""Keep track of the most recent N date pages.
We don't want to crawl all the way back to 1996, so we only
parse the most recent [self.number_of_dates_to_process]
number of date pages. Since cases are usually published
once a week, this means scraping about the most recent
months worth of cases.
"""

def parse_date_pages(self, dates_page):
# For testing, each example file should be a specific sub-date page,
# like https://courts.ms.gov/Images/HDList/SCT02-27-2020.html
if self.test_mode_enabled():
# date below is arbitrary and doesn't matter, it just
# needs to be static for testing to work
self.pages["2020-02-28"] = dates_page
return
for date in self.get_dates_from_date_page(dates_page):
url = "{}/Images/HDList/SCT{}.html".format(
self.domain,
datetime.date.strftime(date, "%m-%d-%Y"),
)
page = self._get_html_tree_by_url(url)
self.pages[f"{date}"] = page

"""Convert string of dates on page into list of date objects.
"""

def get_dates_from_date_page(self, dates_page):
dates = []
substrings = dates_page.text_content().split('"')
for substring in substrings:
try:
dates.append(convert_date_string(substring))
except ValueError:
pass
dates.sort(reverse=True)
return dates[: self.number_of_dates_to_process]
self.url = f"https://courts.ms.gov/appellatecourts/sc/scdecisions.php?date={self.publish_date}"
self.use_proxy = True
self.additional_params = {
"wait_for": "#dispAreaHD > p:nth-child(2) > a"
}

@staticmethod
def most_recent_release_date(day: int) -> str:
    """Return the most recent past date that fell on weekday *day*.

    :param day: target weekday as an int (0=Monday ... 6=Sunday,
        matching ``date.weekday()``)
    :return: the latest date strictly before today falling on that
        weekday, formatted "MM/DD/YYYY"; when today is the target
        weekday, the result is one week ago (``delta or 7``)
    """
    delta = (date.today().weekday() - day) % 7
    return (date.today() - timedelta(days=delta or 7)).strftime("%m/%d/%Y")

def _process_html(self):
for date, page in self.pages.items():
for anchor in page.xpath(".//a[contains(./@href, '.pdf')]"):
parent = anchor.getparent()

# sometimes the first opinion on the pages is nested
# in a <p> tag for whatever reason.
while parent.getparent().tag != "body":
parent = parent.getparent()

sections = parent.xpath("./following-sibling::ul")
if not sections:
# the while loop above should mean we never fall in here
continue

section = sections[0]
self.cases.append(
{
"date": date,
"docket": anchor.text_content().strip(),
"name": section.xpath(".//b")[0]
.text_content()
.strip(),
"summary": section.text_content().strip(),
"url": anchor.xpath("./@href")[0],
}
"""Process the html

:return: None
"""
for link in self.html.xpath(
"//div[@id='dispAreaHD']//a[contains(@href, '.pdf')]"
):
slug = link.xpath("./@href")[0]
if not slug.startswith("http"):
slug = urljoin(
"https://courts.ms.gov/images/",
slug[3:].replace("\\", "/"),
)
ul_nodes = link.xpath("./following::ul[1]")
if not ul_nodes:
continue
self.cases.append(
{
"date": self.publish_date,
"docket": link.text_content().strip(),
"name": ul_nodes[0]
.xpath(".//b")[0]
.text_content()
.strip(),
"summary": ul_nodes[0].text_content().strip(),
"url": slug,
}
)
Loading
Loading