From 56af059455abc64ddaff5d01528ce47c527d0768 Mon Sep 17 00:00:00 2001
From: luism <luismanzur91@outlook.com>
Date: Fri, 8 Aug 2025 12:00:01 -0400
Subject: [PATCH 1/7] feat(scraper): add option to save case metadata and
 content for manual upload

---
 CHANGES.md       |  3 ++-
 sample_caller.py | 63 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 3c995ad80..6332861cc 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -15,7 +15,8 @@ Releases are also tagged in git, if that's helpful.
 The following changes are not yet released, but are code complete:
 
 Features:
--
+- add new flag `--save-for-manual-upload` to `sample_caller` to save
+  the scraped data for manual upload to CourtListener.
 
 Changes:
 -
diff --git a/sample_caller.py b/sample_caller.py
index e72161d79..9e68de16d 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -1,6 +1,7 @@
 import hashlib
 import json
 import logging
+import mimetypes
 import os
 import re
 import signal
@@ -239,6 +240,53 @@ def check_hashes(data: bytes, download_url: str, site) -> None:
         logger.info("Same URL hashes are the same. It's OK")
 
 
+def download_item(data: bytes, item, download_url: str, site):
+    """Save each case's metadata and content for manual upload."""
+
+    # Create a folder named after the court_id
+    folder_name = f"./{site.court_id.replace('.', '_')}"
+    os.makedirs(folder_name, exist_ok=True)
+
+    # Get extension from response headers if possible
+    ext = None
+    if "Content-Type" in site.request["response"].headers:
+        content_type = (
+            site.request["response"]
+            .headers["Content-Type"]
+            .split(";")[0]
+            .strip()
+        )
+        ext = mimetypes.guess_extension(content_type)
+    if not ext:
+        # Fallback: try to get extension from download_url
+        ext = os.path.splitext(parse.unquote(download_url))[1]
+        if not ext:
+            ext = ".bin"  # default if unknown
+
+    file_hash = sha1(download_url)
+    json_path = os.path.join(folder_name, f"{file_hash}.json")
+    content_path = os.path.join(folder_name, f"{file_hash}{ext}")
+
+    # Save metadata
+    with open(json_path, "w") as f:
+        item_parsed = item.copy()
+        if "case_dates" in item_parsed:
+            try:
+                item_parsed["case_dates"] = item_parsed["case_dates"].strftime(
+                    "%d/%m/%Y"
+                )
+            except Exception:
+                pass
+        json.dump(item_parsed, f, indent=2)
+    # Save content
+    if isinstance(data, (bytes, bytearray)):
+        with open(content_path, "wb") as f:
+            f.write(data)
+    else:
+        with open(content_path, "w", encoding="utf-8") as f:
+            f.write(data)
+
+
 def scrape_court(
     site,
     binaries=False,
@@ -246,6 +294,7 @@ def scrape_court(
     doctor_host="",
     test_hashes: bool = False,
     limit: int = 1000,
+    save_for_manual_upload: bool = False,
 ):
     """Calls the requested court(s), gets its binary content, and
     extracts the content if possible. See --extract-content option
@@ -273,7 +322,7 @@ def scrape_court(
         logger.debug("\nAdding new item:")
         log_dict(item)
 
-        if not binaries:
+        if not binaries and not save_for_manual_upload:
             continue
 
         try:
@@ -316,6 +365,9 @@ def scrape_court(
         # Separate cases for easier reading when verbosity=DEBUG
         logger.debug("\n%s\n", "=" * 60)
 
+        if save_for_manual_upload:
+            download_item(data, item, download_url, site)
+
     logger.info(
         "\n%s: Successfully crawled %s items.", site.court_id, len(site)
     )
@@ -487,6 +539,13 @@ def main():
         help="How many items to scrape per `scrape_court` call",
     )
 
+    parser.add_option(
+        "--save-for-manual-upload",
+        action="store_true",
+        default=False,
+        help="Save each case's metadata and content for manual upload. Files are named with a hash and stored in a folder named after the court_id.",
+    )
+
     (options, args) = parser.parse_args()
 
     court_id = options.court_id
@@ -501,6 +560,7 @@ def main():
     save_responses = options.save_responses
     test_hashes = options.test_hashes
     limit_per_scrape = options.limit_per_scrape
+    save_for_manual_upload = options.save_for_manual_upload
 
     if test_hashes:
         binaries = True
@@ -572,6 +632,7 @@ def main():
                     doctor_host,
                     test_hashes,
                     limit_per_scrape,
+                    save_for_manual_upload,
                 )
 
     logger.debug("The scraper has stopped.")

From 9e7a2f13f16186ccb88897f5c7f3887c849d6e7f Mon Sep 17 00:00:00 2001
From: luism <luismanzur91@outlook.com>
Date: Mon, 11 Aug 2025 13:53:00 -0400
Subject: [PATCH 2/7] feat(sample_caller): move extension handling for
 downloaded documents to doctor

---
 sample_caller.py | 38 ++++++++++++--------------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/sample_caller.py b/sample_caller.py
index 9e68de16d..9397e3518 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -1,7 +1,6 @@
 import hashlib
 import json
 import logging
-import mimetypes
 import os
 import re
 import signal
@@ -107,8 +106,6 @@ def extract_doc_content(
         the extracted content
         the structured metadata parsed by Site.extract_from_text
     """
-    if not extract_from_text:
-        return data, {}
 
     # Get the file type from the document's raw content
     extension_url = MICROSERVICE_URLS["buffer-extension"].format(doctor_host)
@@ -118,6 +115,9 @@ def extract_doc_content(
     extension_response.raise_for_status()
     extension = extension_response.text
 
+    if not extract_from_text:
+        return data, {}, extension
+
     files = {"file": (f"something.{extension}", data)}
     url = MICROSERVICE_URLS["document-extract"].format(doctor_host)
     extraction__response = requests.post(url, files=files, timeout=120)
@@ -157,7 +157,7 @@ def extract_doc_content(
     logger.info("\nOpen extracted content with 'file://%s'", filepath)
 
     metadata_dict = site.extract_from_text(extracted_content)
-    return extracted_content, metadata_dict
+    return extracted_content, metadata_dict, extension
 
 
 def get_binary_content(download_url: str, site, exceptions) -> bytes:
@@ -240,32 +240,18 @@ def check_hashes(data: bytes, download_url: str, site) -> None:
         logger.info("Same URL hashes are the same. It's OK")
 
 
-def download_item(data: bytes, item, download_url: str, site):
+def download_item(data: bytes, item, download_url: str, site, extension: str):
     """Save each case's metadata and content for manual upload."""
 
     # Create a folder named after the court_id
-    folder_name = f"./{site.court_id.replace('.', '_')}"
+    folder_name = os.path.join(
+        os.path.expanduser("~"), "Downloads", site.court_id.replace(".", "_")
+    )
     os.makedirs(folder_name, exist_ok=True)
 
-    # Get extension from response headers if possible
-    ext = None
-    if "Content-Type" in site.request["response"].headers:
-        content_type = (
-            site.request["response"]
-            .headers["Content-Type"]
-            .split(";")[0]
-            .strip()
-        )
-        ext = mimetypes.guess_extension(content_type)
-    if not ext:
-        # Fallback: try to get extension from download_url
-        ext = os.path.splitext(parse.unquote(download_url))[1]
-        if not ext:
-            ext = ".bin"  # default if unknown
-
     file_hash = sha1(download_url)
     json_path = os.path.join(folder_name, f"{file_hash}.json")
-    content_path = os.path.join(folder_name, f"{file_hash}{ext}")
+    content_path = os.path.join(folder_name, f"{file_hash}{extension}")
 
     # Save metadata
     with open(json_path, "w") as f:
@@ -273,7 +259,7 @@ def download_item(data: bytes, item, download_url: str, site):
         if "case_dates" in item_parsed:
             try:
                 item_parsed["case_dates"] = item_parsed["case_dates"].strftime(
-                    "%d/%m/%Y"
+                    "%Y%m%d"
                 )
             except Exception:
                 pass
@@ -346,7 +332,7 @@ def scrape_court(
 
         filename = item["case_names"].lower().replace(" ", "_")[:40]
 
-        data, metadata_from_text = extract_doc_content(
+        data, metadata_from_text, extension = extract_doc_content(
             data, extract_content, site, doctor_host, filename
         )
         logger.log(
@@ -366,7 +352,7 @@ def scrape_court(
         logger.debug("\n%s\n", "=" * 60)
 
         if save_for_manual_upload:
-            download_item(data, item, download_url, site)
+            download_item(data, item, download_url, site, extension)
 
     logger.info(
         "\n%s: Successfully crawled %s items.", site.court_id, len(site)

From 7559f0845c30ccb63be17d17d3b7182fdb84c40e Mon Sep 17 00:00:00 2001
From: luism <luismanzur91@outlook.com>
Date: Fri, 22 Aug 2025 11:07:47 -0400
Subject: [PATCH 3/7] fix(sample_caller): update condition to save opinions
 locally

---
 sample_caller.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sample_caller.py b/sample_caller.py
index 9397e3518..5df3b4d4e 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -308,7 +308,7 @@ def scrape_court(
         logger.debug("\nAdding new item:")
         log_dict(item)
 
-        if not binaries and not save_for_manual_upload:
+        if not binaries:
             continue
 
         try:
@@ -548,7 +548,7 @@ def main():
     limit_per_scrape = options.limit_per_scrape
     save_for_manual_upload = options.save_for_manual_upload
 
-    if test_hashes:
+    if test_hashes or save_for_manual_upload:
         binaries = True
 
     if extract_content:

From 9b2f133cc8b9efe68dd84a431636d9df05d58f05 Mon Sep 17 00:00:00 2001
From: Luis Manzur <83871083+Luis-manzur@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:13:24 -0400
Subject: [PATCH 4/7] fix(sample_caller): hash file names with sha256

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 sample_caller.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sample_caller.py b/sample_caller.py
index 5df3b4d4e..20269e102 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -249,7 +249,7 @@ def download_item(data: bytes, item, download_url: str, site, extension: str):
     )
     os.makedirs(folder_name, exist_ok=True)
 
-    file_hash = sha1(download_url)
+    file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest()
     json_path = os.path.join(folder_name, f"{file_hash}.json")
     content_path = os.path.join(folder_name, f"{file_hash}{extension}")
 

From 02fc1f0ba09c3501ce0587b65c8147f9f6983c5a Mon Sep 17 00:00:00 2001
From: Luis Manzur <83871083+Luis-manzur@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:13:47 -0400
Subject: [PATCH 5/7] fix(sample_caller): narrow case_dates except clause

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 sample_caller.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sample_caller.py b/sample_caller.py
index 20269e102..2a9c695bc 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -261,7 +261,7 @@ def download_item(data: bytes, item, download_url: str, site, extension: str):
                 item_parsed["case_dates"] = item_parsed["case_dates"].strftime(
                     "%Y%m%d"
                 )
-            except Exception:
+            except (AttributeError, TypeError):
                 pass
         json.dump(item_parsed, f, indent=2)
     # Save content

From 020b5fb987bdf72adc4aeb42736531d502613039 Mon Sep 17 00:00:00 2001
From: Luis Manzur <83871083+Luis-manzur@users.noreply.github.com>
Date: Fri, 22 Aug 2025 11:14:33 -0400
Subject: [PATCH 6/7] fix(sample_caller): ensure extension has leading dot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 sample_caller.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sample_caller.py b/sample_caller.py
index 2a9c695bc..6dd54e59f 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -251,7 +251,9 @@ def download_item(data: bytes, item, download_url: str, site, extension: str):
 
     file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest()
     json_path = os.path.join(folder_name, f"{file_hash}.json")
-    content_path = os.path.join(folder_name, f"{file_hash}{extension}")
+    # Ensure extension starts with a dot
+    ext = extension if extension.startswith('.') else f'.{extension}'
+    content_path = os.path.join(folder_name, f"{file_hash}{ext}")
 
     # Save metadata
     with open(json_path, "w") as f:

From 722948628bd525eb312ae252091364b5ec5d90da Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 22 Aug 2025 15:15:02 +0000
Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 sample_caller.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sample_caller.py b/sample_caller.py
index 6dd54e59f..576d9e569 100755
--- a/sample_caller.py
+++ b/sample_caller.py
@@ -252,7 +252,7 @@ def download_item(data: bytes, item, download_url: str, site, extension: str):
     file_hash = hashlib.sha256(force_bytes(download_url)).hexdigest()
     json_path = os.path.join(folder_name, f"{file_hash}.json")
     # Ensure extension starts with a dot
-    ext = extension if extension.startswith('.') else f'.{extension}'
+    ext = extension if extension.startswith(".") else f".{extension}"
     content_path = os.path.join(folder_name, f"{file_hash}{ext}")
 
     # Save metadata