diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py index c4addc7f4..8955a5a0b 100644 --- a/juriscraper/pacer/docket_report.py +++ b/juriscraper/pacer/docket_report.py @@ -1,4 +1,5 @@ import copy +import os import pprint import re import sys @@ -22,6 +23,7 @@ from .docket_utils import normalize_party_types from .reports import BaseReport from .utils import ( + get_doc_id_prefix_from_court_id, get_file_size_str_from_tr, get_input_value_from_tr, get_pacer_doc_id_from_doc1_url, @@ -489,6 +491,34 @@ def __init__(self, court_id, pacer_session=None): self._parties = None self._docket_entries = None + @classmethod + def from_html_file(cls, path): + """Run the docket parser on an HTML file. + + This is invoked by the test runner + (tests/local/test_DocketParseTest.py), as well as the __main__ + of this file, as well as juriscraper/pacerdocket.py. + + Bring the common code in here. + """ + + dirname, filename = os.path.split(path) + filename_sans_ext = filename.split(".")[0] + court = filename_sans_ext.split("_")[0] + # If filename doesn't begin with a valid court, just use 'cand' + # (N.D. Cal.) Historically the __main__ runner would default + # to 'cand' but the pacerdocket.py runner would default to + # 'psc', but the 'psc' fails some validations. + try: + _ = get_doc_id_prefix_from_court_id(court) + except KeyError: + court = "cand" + + report = DocketReport(court) + with open(path, "rb") as f: + report._parse_text(f.read().decode("utf-8")) + return report + @property def docket_report_has_content(self) -> bool: """Checks if the docket report has content. @@ -1749,10 +1779,6 @@ def _get_judge(self, regex): print("Usage: python -m juriscraper.pacer.docket_report filepath") print("Please provide a path to an HTML file to parse.") sys.exit(1) - report = DocketReport("cand") # Court ID is only needed for querying. 
- filepath = sys.argv[1] - print(f"Parsing HTML file at {filepath}") - with open(filepath) as f: - text = f.read() - report._parse_text(text) + + report = DocketReport.from_html_file(sys.argv[1]) pprint.pprint(report.data, indent=2) diff --git a/juriscraper/pacerdocket.py b/juriscraper/pacerdocket.py index 09b94d45b..e090a6ef3 100755 --- a/juriscraper/pacerdocket.py +++ b/juriscraper/pacerdocket.py @@ -8,13 +8,11 @@ import jsondate3 as json from juriscraper.pacer import DocketReport -from juriscraper.pacer.http import PacerSession - -pacer_session = PacerSession(username="tr1234", password="Pass!234") -report = DocketReport("psc", pacer_session) for path in sys.argv[1:]: - with open(path) as f: - report._parse_text(f.read()) - data = report.data - print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": "))) + report = DocketReport.from_html_file(path) + print( + json.dumps( + report.data, indent=2, sort_keys=True, separators=(",", ": ") + ) + ) diff --git a/tests/local/test_DocketParseTest.py b/tests/local/test_DocketParseTest.py index 44eb287e9..c7bf92375 100644 --- a/tests/local/test_DocketParseTest.py +++ b/tests/local/test_DocketParseTest.py @@ -40,11 +40,10 @@ def run_parsers_on_path( dirname, filename = os.path.split(path) filename_sans_ext = filename.split(".")[0] json_path = os.path.join(dirname, f"{filename_sans_ext}.json") + court = filename_sans_ext.split("_")[0] + report = DocketReport.from_html_file(path) - report = DocketReport(court) - with open(path, "rb") as f: - report._parse_text(f.read().decode("utf-8")) data = report.data if data != {}: