2 changes: 2 additions & 0 deletions CHANGES.md
@@ -16,6 +16,8 @@ The following changes are not yet released, but are code complete:
 
 Features:
 - implement `neb` and `nebctapp` extract_from_text #1549
+- add a new flag, `--save-for-manual-upload`, to `sample_caller` to save
+  the scraped data for manual upload to CourtListener.
 
 Changes:
 -
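A typical run with the new flag might look like the following (a sketch only; the `-c` court option and the module path come from sample_caller's existing CLI and are not part of this diff):

    python sample_caller.py -c juriscraper.opinions.united_states.state.neb --save-for-manual-upload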
59 changes: 54 additions & 5 deletions sample_caller.py
@@ -106,8 +106,6 @@ def extract_doc_content(
         the extracted content
         the structured metadata parsed by Site.extract_from_text
     """
-    if not extract_from_text:
-        return data, {}
 
     # Get the file type from the document's raw content
     extension_url = MICROSERVICE_URLS["buffer-extension"].format(doctor_host)
@@ -117,6 +115,9 @@ def extract_doc_content(
     extension_response.raise_for_status()
     extension = extension_response.text
 
+    if not extract_from_text:
+        return data, {}, extension
+
     files = {"file": (f"something.{extension}", data)}
     url = MICROSERVICE_URLS["document-extract"].format(doctor_host)
     extraction_response = requests.post(url, files=files, timeout=120)
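Note the reordering above: the buffer-extension lookup now runs before the early return, so callers that skip text extraction still get the file type. A minimal sketch of the updated contract, using only names that appear in this diff:

    # With extract_from_text falsy, the content passes through unchanged;
    # the extension still comes from the buffer-extension microservice.
    content, metadata, extension = extract_doc_content(
        data, False, site, doctor_host, filename
    )
    assert content == data and metadata == {}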
@@ -156,7 +157,7 @@ def extract_doc_content(
     logger.info("\nOpen extracted content with 'file://%s'", filepath)
 
     metadata_dict = site.extract_from_text(extracted_content)
-    return extracted_content, metadata_dict
+    return extracted_content, metadata_dict, extension
 
 
 def get_binary_content(download_url: str, site, exceptions) -> bytes:
@@ -239,13 +240,49 @@ def check_hashes(data: bytes, download_url: str, site) -> None:
         logger.info("Same URL hashes are the same. It's OK")
 
 
+def download_item(data: bytes, item, download_url: str, site, extension: str):
+    """Save each case's metadata and content for manual upload."""
+
+    # Create a folder named after the court_id
+    folder_name = os.path.join(
+        os.path.expanduser("~"), "Downloads", site.court_id.replace(".", "_")
+    )
+    os.makedirs(folder_name, exist_ok=True)
+
+    file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest()
+    json_path = os.path.join(folder_name, f"{file_hash}.json")
+    # Ensure extension starts with a dot
+    ext = extension if extension.startswith(".") else f".{extension}"
+    content_path = os.path.join(folder_name, f"{file_hash}{ext}")
+
+    # Save metadata
+    with open(json_path, "w") as f:
+        item_parsed = item.copy()
+        if "case_dates" in item_parsed:
+            try:
+                item_parsed["case_dates"] = item_parsed["case_dates"].strftime(
+                    "%Y%m%d"
+                )
+            except (AttributeError, TypeError):
+                pass
+        json.dump(item_parsed, f, indent=2)
+    # Save content
+    if isinstance(data, (bytes, bytearray)):
+        with open(content_path, "wb") as f:
+            f.write(data)
+    else:
+        with open(content_path, "w", encoding="utf-8") as f:
+            f.write(data)
+
+
 def scrape_court(
     site,
     binaries=False,
     extract_content=False,
     doctor_host="",
     test_hashes: bool = False,
     limit: int = 1000,
+    save_for_manual_upload: bool = False,
 ):
     """Calls the requested court(s), gets its binary content, and
     extracts the content if possible. See --extract-content option
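For orientation, the on-disk layout download_item produces, assuming site.court_id carries a dotted module path like "juriscraper.opinions.united_states.state.neb" and a PDF document (values illustrative):

    ~/Downloads/juriscraper_opinions_united_states_state_neb/
        <sha256-of-download_url>.json   # case metadata; case_dates serialized as YYYYMMDD
        <sha256-of-download_url>.pdf    # the document content itself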
@@ -297,7 +334,7 @@ def scrape_court(
 
             filename = item["case_names"].lower().replace(" ", "_")[:40]
 
-            data, metadata_from_text = extract_doc_content(
+            data, metadata_from_text, extension = extract_doc_content(
                 data, extract_content, site, doctor_host, filename
             )
             logger.log(
@@ -316,6 +353,9 @@ def scrape_court(
             # Separate cases for easier reading when verbosity=DEBUG
             logger.debug("\n%s\n", "=" * 60)
 
+            if save_for_manual_upload:
+                download_item(data, item, download_url, site, extension)
+
     logger.info(
         "\n%s: Successfully crawled %s items.", site.court_id, len(site)
     )
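One subtlety at this call site: after extract_doc_content, `data` holds the raw document bytes when extraction is off, but the extracted text (a str) when --extract-content is on. download_item's isinstance check handles both shapes; pdf_bytes and extracted_text below are placeholder variables:

    download_item(pdf_bytes, item, download_url, site, "pdf")       # bytes -> written with "wb"
    download_item(extracted_text, item, download_url, site, "pdf")  # str   -> written as UTF-8 text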
@@ -487,6 +527,13 @@ def main():
         help="How many items to scrape per `scrape_court` call",
     )
 
+    parser.add_option(
+        "--save-for-manual-upload",
+        action="store_true",
+        default=False,
+        help="Save each case's metadata and content for manual upload. Files are named with a hash and stored in a folder named after the court_id.",
+    )
+
     (options, args) = parser.parse_args()
 
     court_id = options.court_id
@@ -501,8 +548,9 @@ def main():
     save_responses = options.save_responses
     test_hashes = options.test_hashes
     limit_per_scrape = options.limit_per_scrape
+    save_for_manual_upload = options.save_for_manual_upload
 
-    if test_hashes:
+    if test_hashes or save_for_manual_upload:
         binaries = True
 
     if extract_content:
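The new flag joins test_hashes in forcing binaries on, since saving a document requires its downloaded bytes. An equivalent explicit form (a hypothetical rewrite, not part of the diff):

    binaries = binaries or test_hashes or save_for_manual_upload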
@@ -572,6 +620,7 @@ def main():
         doctor_host,
         test_hashes,
         limit_per_scrape,
+        save_for_manual_upload,
     )
 
     logger.debug("The scraper has stopped.")