Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 32 additions & 6 deletions juriscraper/pacer/docket_report.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import os
import pprint
import re
import sys
Expand All @@ -22,6 +23,7 @@
from .docket_utils import normalize_party_types
from .reports import BaseReport
from .utils import (
get_doc_id_prefix_from_court_id,
get_file_size_str_from_tr,
get_input_value_from_tr,
get_pacer_doc_id_from_doc1_url,
Expand Down Expand Up @@ -489,6 +491,34 @@ def __init__(self, court_id, pacer_session=None):
self._parties = None
self._docket_entries = None

@classmethod
def from_html_file(cls, path):
"""Run the docket parser on an HTML file.

This is invokved by by the test runner
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
This is invokved by by the test runner
This is invoked by the test runner

(tests/local/test_DocketParseTest.py), as well as the __main__
of this file, as well as juriscraper/pacerdocket.py.

Bring the common code in here.
"""

dirname, filename = os.path.split(path)
filename_sans_ext = filename.split(".")[0]
court = filename_sans_ext.split("_")[0]
# If filename doesn't begin with a valid court, just use 'cand'
# (N.D. Cal.) Historically the __main__ runner would default
# to 'cand' but the pacerdocket.py runner would default to
# 'psc', but the 'psc' fails some validations.
try:
_ = get_doc_id_prefix_from_court_id(court)
except KeyError:
court = "cand"
Comment on lines +505 to +515
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the change, but I think it might be better to just make the court a requirement. Maybe we allow it to be passed as an optional argument if we can't require its use in input?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your intent is not particularly clear to me, and there are a million options.
I chose the simplest, moving the filename-deriving code from the test runner into the class method, and choosing one of the two previous disparate defaults.

If by "requirement" you mean pass it in as a parameter, then you want to put the filename-based code back in the test runner? And force the user who invokes the other two mechanisms to specify it?

So you could no longer run

python pacerdocket.py filename.html

like you've always been able to, you need to run

python pacerdocket.py nysd filename.html

? If that's the proposal, it seems to be more annoying to the user than we were prior to this PR, and not particularly helpful in solving any diagnostic problem.

I'm also not sure what would happen if there were an optional argument. So if you didn't specify it, then it would default to… what? cand? psc? Or it would return no court (or None) in all the places that this is used?

I don't have a full assessment of what they all are, but somewhere in the hairy machinery of "html cleaning", juriscraper rewrites relative URLs to absolute URLs using the court name. You would just leave them as relative URLs then?

All of these choices seem more complicated, but let me know what you want.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking it becomes:

def from_html_file(cls, path, court=None):

If the court isn't passed, it tries from the file name. If that fails, it crashes. I think that'd make it so that places that need to be explicit can be and so that the court parameter isn't necessary.

I don't like having a default to psc or cand or whatever in a class method since that could slip into a production usage, so crashing should prevent that.


report = DocketReport(court)
with open(path, "rb") as f:
report._parse_text(f.read().decode("utf-8"))
return report

@property
def docket_report_has_content(self) -> bool:
"""Checks if the docket report has content.
Expand Down Expand Up @@ -1749,10 +1779,6 @@ def _get_judge(self, regex):
print("Usage: python -m juriscraper.pacer.docket_report filepath")
print("Please provide a path to an HTML file to parse.")
sys.exit(1)
report = DocketReport("cand") # Court ID is only needed for querying.
filepath = sys.argv[1]
print(f"Parsing HTML file at {filepath}")
with open(filepath) as f:
text = f.read()
report._parse_text(text)

report = DocketReport.from_html_file(sys.argv[1])
pprint.pprint(report.data, indent=2)
14 changes: 6 additions & 8 deletions juriscraper/pacerdocket.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,11 @@
import jsondate3 as json

from juriscraper.pacer import DocketReport
from juriscraper.pacer.http import PacerSession

pacer_session = PacerSession(username="tr1234", password="Pass!234")
report = DocketReport("psc", pacer_session)

for path in sys.argv[1:]:
with open(path) as f:
report._parse_text(f.read())
data = report.data
print(json.dumps(data, indent=2, sort_keys=True, separators=(",", ": ")))
report = DocketReport.from_html_file(path)
print(
json.dumps(
report.data, indent=2, sort_keys=True, separators=(",", ": ")
)
)
5 changes: 2 additions & 3 deletions tests/local/test_DocketParseTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,10 @@ def run_parsers_on_path(
dirname, filename = os.path.split(path)
filename_sans_ext = filename.split(".")[0]
json_path = os.path.join(dirname, f"{filename_sans_ext}.json")

court = filename_sans_ext.split("_")[0]
report = DocketReport.from_html_file(path)

report = DocketReport(court)
with open(path, "rb") as f:
report._parse_text(f.read().decode("utf-8"))
data = report.data

if data != {}:
Expand Down
Loading