From 56af059455abc64ddaff5d01528ce47c527d0768 Mon Sep 17 00:00:00 2001 From: luism Date: Fri, 8 Aug 2025 12:00:01 -0400 Subject: [PATCH 1/7] feat(scraper): add option to save case metadata and content for manual upload --- CHANGES.md | 3 ++- sample_caller.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 3c995ad80..6332861cc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,8 @@ Releases are also tagged in git, if that's helpful. The following changes are not yet released, but are code complete: Features: -- +- add new flag `--save-for-manual-upload` to `sample_caller` to save + the scraped data for manual upload to CourtListener. Changes: - diff --git a/sample_caller.py b/sample_caller.py index e72161d79..9e68de16d 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -1,6 +1,7 @@ import hashlib import json import logging +import mimetypes import os import re import signal @@ -239,6 +240,53 @@ def check_hashes(data: bytes, download_url: str, site) -> None: logger.info("Same URL hashes are the same. It's OK") +def download_item(data: bytes, item, download_url: str, site): + """Save each case's metadata and content for manual upload.""" + + # Create a folder named after the court_id + folder_name = f"./{site.court_id.replace('.', '_')}" + os.makedirs(folder_name, exist_ok=True) + + # Get extension from response headers if possible + ext = None + if "Content-Type" in site.request["response"].headers: + content_type = ( + site.request["response"] + .headers["Content-Type"] + .split(";")[0] + .strip() + ) + ext = mimetypes.guess_extension(content_type) + if not ext: + # Fallback: try to get extension from download_url + ext = os.path.splitext(parse.unquote(download_url))[1] + if not ext: + ext = ".bin" # default if unknown + + file_hash = sha1(download_url) + json_path = os.path.join(folder_name, f"{file_hash}.json") + content_path = os.path.join(folder_name, f"{file_hash}{ext}") + + # Save metadata + with open(json_path, "w") as f: + item_parsed = item.copy() + if "case_dates" in item_parsed: + try: + item_parsed["case_dates"] = item_parsed["case_dates"].strftime( + "%d/%m/%Y" + ) + except Exception: + pass + json.dump(item_parsed, f, indent=2) + # Save content + if isinstance(data, (bytes, bytearray)): + with open(content_path, "wb") as f: + f.write(data) + else: + with open(content_path, "w", encoding="utf-8") as f: + f.write(data) + + def scrape_court( site, binaries=False, @@ -246,6 +294,7 @@ def scrape_court( doctor_host="", test_hashes: bool = False, limit: int = 1000, + save_for_manual_upload: bool = False, ): """Calls the requested court(s), gets its binary content, and extracts the content if possible. See --extract-content option @@ -273,7 +322,7 @@ def scrape_court( logger.debug("\nAdding new item:") log_dict(item) - if not binaries: + if not binaries and not save_for_manual_upload: continue try: @@ -316,6 +365,9 @@ def scrape_court( # Separate cases for easier reading when verbosity=DEBUG logger.debug("\n%s\n", "=" * 60) + if save_for_manual_upload: + download_item(data, item, download_url, site) + logger.info( "\n%s: Successfully crawled %s items.", site.court_id, len(site) ) @@ -487,6 +539,13 @@ def main(): help="How many items to scrape per `scrape_court` call", ) + parser.add_option( + "--save-for-manual-upload", + action="store_true", + default=False, + help="Save each case's metadata and content for manual upload. Files are named with a hash and stored in a folder named after the court_id.", + ) + (options, args) = parser.parse_args() court_id = options.court_id @@ -501,6 +560,7 @@ def main(): save_responses = options.save_responses test_hashes = options.test_hashes limit_per_scrape = options.limit_per_scrape + save_for_manual_upload = options.save_for_manual_upload if test_hashes: binaries = True @@ -572,6 +632,7 @@ def main(): doctor_host, test_hashes, limit_per_scrape, + save_for_manual_upload, ) logger.debug("The scraper has stopped.") From 9e7a2f13f16186ccb88897f5c7f3887c849d6e7f Mon Sep 17 00:00:00 2001 From: luism Date: Mon, 11 Aug 2025 13:53:00 -0400 Subject: [PATCH 2/7] feat(sample_caller): move extension handling for downloaded documents to doctor --- sample_caller.py | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/sample_caller.py b/sample_caller.py index 9e68de16d..9397e3518 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -1,7 +1,6 @@ import hashlib import json import logging -import mimetypes import os import re import signal @@ -107,8 +106,6 @@ def extract_doc_content( the extracted content the structured metadata parsed by Site.extract_from_text """ - if not extract_from_text: - return data, {} # Get the file type from the document's raw content extension_url = MICROSERVICE_URLS["buffer-extension"].format(doctor_host) @@ -118,6 +115,9 @@ def extract_doc_content( extension_response.raise_for_status() extension = extension_response.text + if not extract_from_text: + return data, {}, extension + files = {"file": (f"something.{extension}", data)} url = MICROSERVICE_URLS["document-extract"].format(doctor_host) extraction__response = requests.post(url, files=files, timeout=120) @@ -157,7 +157,7 @@ def extract_doc_content( logger.info("\nOpen extracted content with 'file://%s'", filepath) metadata_dict = site.extract_from_text(extracted_content) - return extracted_content, metadata_dict + return extracted_content, metadata_dict, extension def get_binary_content(download_url: str, site, exceptions) -> bytes: @@ -240,32 +240,18 @@ def check_hashes(data: bytes, download_url: str, site) -> None: logger.info("Same URL hashes are the same. It's OK") -def download_item(data: bytes, item, download_url: str, site): +def download_item(data: bytes, item, download_url: str, site, extension: str): """Save each case's metadata and content for manual upload.""" # Create a folder named after the court_id - folder_name = f"./{site.court_id.replace('.', '_')}" + folder_name = os.path.join( + os.path.expanduser("~"), "Downloads", site.court_id.replace(".", "_") + ) os.makedirs(folder_name, exist_ok=True) - # Get extension from response headers if possible - ext = None - if "Content-Type" in site.request["response"].headers: - content_type = ( - site.request["response"] - .headers["Content-Type"] - .split(";")[0] - .strip() - ) - ext = mimetypes.guess_extension(content_type) - if not ext: - # Fallback: try to get extension from download_url - ext = os.path.splitext(parse.unquote(download_url))[1] - if not ext: - ext = ".bin" # default if unknown - file_hash = sha1(download_url) json_path = os.path.join(folder_name, f"{file_hash}.json") - content_path = os.path.join(folder_name, f"{file_hash}{ext}") + content_path = os.path.join(folder_name, f"{file_hash}{extension}") # Save metadata with open(json_path, "w") as f: @@ -273,7 +259,7 @@ def download_item(data: bytes, item, download_url: str, site): if "case_dates" in item_parsed: try: item_parsed["case_dates"] = item_parsed["case_dates"].strftime( - "%d/%m/%Y" + "%Y%m%d" ) except Exception: pass @@ -346,7 +332,7 @@ def scrape_court( filename = item["case_names"].lower().replace(" ", "_")[:40] - data, metadata_from_text = extract_doc_content( + data, metadata_from_text, extension = extract_doc_content( data, extract_content, site, doctor_host, filename ) logger.log( @@ -366,7 +352,7 @@ def scrape_court( logger.debug("\n%s\n", "=" * 60) if save_for_manual_upload: - download_item(data, item, download_url, site) + download_item(data, item, download_url, site, extension) logger.info( "\n%s: Successfully crawled %s items.", site.court_id, len(site) From 7559f0845c30ccb63be17d17d3b7182fdb84c40e Mon Sep 17 00:00:00 2001 From: luism Date: Fri, 22 Aug 2025 11:07:47 -0400 Subject: [PATCH 3/7] fix(sample_caller): update condition to save opinions locally --- sample_caller.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample_caller.py b/sample_caller.py index 9397e3518..5df3b4d4e 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -308,7 +308,7 @@ def scrape_court( logger.debug("\nAdding new item:") log_dict(item) - if not binaries and not save_for_manual_upload: + if not binaries: continue try: @@ -548,7 +548,7 @@ def main(): limit_per_scrape = options.limit_per_scrape save_for_manual_upload = options.save_for_manual_upload - if test_hashes: + if test_hashes or save_for_manual_upload: binaries = True if extract_content: From 9b2f133cc8b9efe68dd84a431636d9df05d58f05 Mon Sep 17 00:00:00 2001 From: Luis Manzur <83871083+Luis-manzur@users.noreply.github.com> Date: Fri, 22 Aug 2025 11:13:24 -0400 Subject: [PATCH 4/7] Update sample_caller.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sample_caller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_caller.py b/sample_caller.py index 5df3b4d4e..20269e102 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -249,7 +249,7 @@ def download_item(data: bytes, item, download_url: str, site, extension: str): ) os.makedirs(folder_name, exist_ok=True) - file_hash = sha1(download_url) + file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest() json_path = os.path.join(folder_name, f"{file_hash}.json") content_path = os.path.join(folder_name, f"{file_hash}{extension}") From 02fc1f0ba09c3501ce0587b65c8147f9f6983c5a Mon Sep 17 00:00:00 2001 From: Luis Manzur <83871083+Luis-manzur@users.noreply.github.com> Date: Fri, 22 Aug 2025 11:13:47 -0400 Subject: [PATCH 5/7] Update sample_caller.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sample_caller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_caller.py b/sample_caller.py index 20269e102..2a9c695bc 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -261,7 +261,7 @@ def download_item(data: bytes, item, download_url: str, site, extension: str): item_parsed["case_dates"] = item_parsed["case_dates"].strftime( "%Y%m%d" ) - except Exception: + except (AttributeError, TypeError): pass json.dump(item_parsed, f, indent=2) # Save content From 020b5fb987bdf72adc4aeb42736531d502613039 Mon Sep 17 00:00:00 2001 From: Luis Manzur <83871083+Luis-manzur@users.noreply.github.com> Date: Fri, 22 Aug 2025 11:14:33 -0400 Subject: [PATCH 6/7] Update sample_caller.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sample_caller.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sample_caller.py b/sample_caller.py index 2a9c695bc..6dd54e59f 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -251,7 +251,9 @@ def download_item(data: bytes, item, download_url: str, site, extension: str): file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest() json_path = os.path.join(folder_name, f"{file_hash}.json") - content_path = os.path.join(folder_name, f"{file_hash}{extension}") + # Ensure extension starts with a dot + ext = extension if extension.startswith('.') else f'.{extension}' + content_path = os.path.join(folder_name, f"{file_hash}{ext}") # Save metadata with open(json_path, "w") as f: From 722948628bd525eb312ae252091364b5ec5d90da Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 Aug 2025 15:15:02 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- sample_caller.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sample_caller.py b/sample_caller.py index 6dd54e59f..576d9e569 100755 --- a/sample_caller.py +++ b/sample_caller.py @@ -252,7 +252,7 @@ def download_item(data: bytes, item, download_url: str, site, extension: str): file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest() json_path = os.path.join(folder_name, f"{file_hash}.json") # Ensure extension starts with a dot - ext = extension if extension.startswith('.') else f'.{extension}' + ext = extension if extension.startswith(".") else f".{extension}" content_path = os.path.join(folder_name, f"{file_hash}{ext}") # Save metadata