Skip to content

Commit c5462ba

Browse files
committed
add detailed_crossreferences
1 parent 826edbd commit c5462ba

File tree

4 files changed

+104
-29
lines changed

4 files changed

+104
-29
lines changed

__main__.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,17 @@ def get_subseqitem_conf(subseqitems):
145145
default=False,
146146
help="Include regulations",
147147
)
148+
149+
parser.add_argument(
150+
"-dc",
151+
"--detailed-crossreferences",
152+
dest="detailed_crossreferences",
153+
action="store_const",
154+
const=True,
155+
default=False,
156+
help="Resolve cross references on the lowest possible level. "
157+
"Default is to resolve on seqitem level (e.g. sections).",
158+
)
148159
args = parser.parse_args()
149160

150161
steps = [step.lower() for step in args.steps]
@@ -156,6 +167,7 @@ def get_subseqitem_conf(subseqitems):
156167
interval = args.interval
157168
selected_items = args.filter or []
158169
regulations = args.regulations
170+
detailed_crossreferences = args.detailed_crossreferences
159171

160172
if dataset not in ["de", "us"]:
161173
raise Exception(f"{dataset} unsupported dataset. Options: us, de")
@@ -300,11 +312,16 @@ def get_subseqitem_conf(subseqitems):
300312

301313
if "crossreference_lookup" in steps:
302314
if dataset == "us":
303-
step = UsCrossreferenceLookup(regulations=regulations, processes=processes)
315+
step = UsCrossreferenceLookup(
316+
detailed_crossreferences=detailed_crossreferences,
317+
regulations=regulations,
318+
processes=processes,
319+
)
304320
items = step.get_items(overwrite, snapshots)
305321
step.execute_items(items)
306322

307323
elif dataset == "de":
324+
assert not detailed_crossreferences
308325
step = DeCrossreferenceLookup(regulations=regulations, processes=processes)
309326
items = step.get_items(snapshots)
310327
step.execute_items(items)
@@ -314,12 +331,15 @@ def get_subseqitem_conf(subseqitems):
314331
if "crossreference_edgelist" in steps:
315332
if dataset == "us":
316333
step = UsCrossreferenceEdgelist(
317-
regulations=regulations, processes=processes
334+
detailed_crossreferences=detailed_crossreferences,
335+
regulations=regulations,
336+
processes=processes,
318337
)
319338
items = step.get_items(overwrite, snapshots)
320339
step.execute_items(items)
321340

322341
elif dataset == "de":
342+
assert not detailed_crossreferences
323343
law_names_data = load_law_names(regulations)
324344
step = DeCrossreferenceEdgelist(
325345
regulations=regulations,
@@ -353,14 +373,15 @@ def get_subseqitem_conf(subseqitems):
353373
US_REG_CROSSREFERENCE_GRAPH_PATH
354374
if regulations
355375
else US_CROSSREFERENCE_GRAPH_PATH
356-
)
376+
) + ("/detailed" if detailed_crossreferences else "")
357377
edgelist_folder = (
358378
US_REG_CROSSREFERENCE_EDGELIST_PATH
359379
if regulations
360380
else US_CROSSREFERENCE_EDGELIST_PATH
361-
)
381+
) + ("/detailed" if detailed_crossreferences else "")
362382
authority_edgelist_folder = US_REG_AUTHORITY_EDGELIST_PATH
363383
elif dataset == "de":
384+
assert not detailed_crossreferences
364385
source = (
365386
DE_REG_HIERARCHY_GRAPH_PATH if regulations else DE_HIERARCHY_GRAPH_PATH
366387
)
@@ -393,6 +414,7 @@ def get_subseqitem_conf(subseqitems):
393414
print("Make crossreference graph: done")
394415

395416
if "snapshot_mapping_index" in steps:
417+
assert not detailed_crossreferences
396418
if dataset == "us":
397419
source_text = (
398420
[US_REFERENCE_PARSED_PATH, US_REG_REFERENCE_PARSED_PATH]
@@ -433,6 +455,7 @@ def get_subseqitem_conf(subseqitems):
433455
print("Make snapshot mapping: done")
434456

435457
if "snapshot_mapping_edgelist" in steps:
458+
assert not detailed_crossreferences
436459
if dataset == "us":
437460
source = os.path.join(
438461
US_REG_SNAPSHOT_MAPPING_INDEX_PATH

statutes_pipeline_steps/us_crossreference_edgelist.py

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717

1818

1919
class UsCrossreferenceEdgelist(RegulationsPipelineStep):
20+
def __init__(self, detailed_crossreferences=False, *args, **kwargs):
    """Create the edgelist step.

    Args:
        detailed_crossreferences: When true, the step reads from and
            writes to the ``/detailed`` sub-folders and attributes each
            reference to the element enclosing it rather than to the
            seqitem. Defaults to ``False`` so callers that only pass the
            base-class arguments keep working.
        *args: Forwarded unchanged to the base pipeline step.
        **kwargs: Forwarded unchanged to the base pipeline step.
    """
    # Set the flag before delegating: the path properties read it and the
    # base initialiser may already access them.
    self.detailed_crossreferences = detailed_crossreferences
    super().__init__(*args, **kwargs)
23+
2024
def get_items(self, overwrite, snapshots) -> list:
2125
ensure_exists(self.dest)
2226
if not snapshots:
@@ -38,15 +42,15 @@ def dest(self):
3842
US_REG_CROSSREFERENCE_EDGELIST_PATH
3943
if self.regulations
4044
else US_CROSSREFERENCE_EDGELIST_PATH
41-
)
45+
) + ("/detailed" if self.detailed_crossreferences else "")
4246

4347
@property
4448
def lookup(self):
4549
return (
4650
US_REG_CROSSREFERENCE_LOOKUP_PATH
4751
if self.regulations
4852
else US_CROSSREFERENCE_LOOKUP_PATH
49-
)
53+
) + ("/detailed" if self.detailed_crossreferences else "")
5054

5155
def execute_item(self, item):
5256
yearfiles = [
@@ -79,20 +83,29 @@ def make_edge_list(self, yearfile_path, key_dict):
7983
file_elem = lxml.etree.parse(f)
8084
edge_list = []
8185

82-
# for debug
83-
# problem_matches = set()
84-
# problem_keys = set()
85-
86-
for seqitem_elem in file_elem.xpath("//seqitem"):
87-
node_out = seqitem_elem.attrib.get("key")
88-
for ref_elem in seqitem_elem.xpath(".//reference"):
86+
if self.detailed_crossreferences:
87+
for ref_elem in file_elem.xpath(".//reference"):
88+
node_out = ref_elem.getparent().getparent().attrib.get("key")
8989
refs = json.loads(ref_elem.attrib["parsed"])
9090
for ref in refs:
91-
key = "_".join(ref[:2])
92-
node_in = key_dict.get(key)
93-
94-
if node_in:
95-
edge_list.append([node_out, node_in])
91+
for cutoff in range(len(ref), 1, -1):
92+
key = "_".join(ref[:cutoff])
93+
node_in = key_dict.get(key)
94+
if node_in:
95+
edge_list.append([node_out, node_in])
96+
break
97+
else:
98+
for seqitem_elem in file_elem.xpath("//seqitem"):
99+
node_out = seqitem_elem.attrib.get("key")
100+
for ref_elem in seqitem_elem.xpath(".//reference"):
101+
refs = json.loads(ref_elem.attrib["parsed"])
102+
for ref in refs:
103+
for cutoff in range(len(ref), 1, -1):
104+
key = "_".join(ref[:cutoff])
105+
node_in = key_dict.get(key)
106+
if node_in:
107+
edge_list.append([node_out, node_in])
108+
break
96109
return edge_list
97110

98111

statutes_pipeline_steps/us_crossreference_lookup.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,12 @@
1414

1515

1616
class UsCrossreferenceLookup(RegulationsPipelineStep):
17+
def __init__(self, detailed_crossreferences=False, *args, **kwargs):
    """Create the lookup step.

    Args:
        detailed_crossreferences: When true, the lookup is written to the
            ``/detailed`` sub-folder and additionally contains one row per
            entry of every ``citekey_detailed`` attribute. Defaults to
            ``False`` so callers that only pass the base-class arguments
            keep working.
        *args: Forwarded unchanged to the base pipeline step.
        **kwargs: Forwarded unchanged to the base pipeline step.
    """
    # Set the flag before delegating: the ``dest`` property reads it and
    # the base initialiser may already access it.
    self.detailed_crossreferences = detailed_crossreferences
    super().__init__(*args, **kwargs)
20+
1721
def get_items(self, overwrite, snapshots) -> list:
18-
dest = (
19-
US_REG_CROSSREFERENCE_LOOKUP_PATH
20-
if self.regulations
21-
else US_CROSSREFERENCE_LOOKUP_PATH
22-
)
23-
ensure_exists(dest)
22+
ensure_exists(self.dest)
2423

2524
# If snapshots not set, create list of all years
2625
if not snapshots:
@@ -34,20 +33,22 @@ def get_items(self, overwrite, snapshots) -> list:
3433
)
3534

3635
if not overwrite:
37-
existing_files = os.listdir(dest)
36+
existing_files = os.listdir(self.dest)
3837
snapshots = list(
3938
filter(lambda f: get_filename(f) not in existing_files, snapshots)
4039
)
4140

4241
return snapshots
4342

44-
def execute_item(self, item):
45-
dest = (
43+
@property
44+
def dest(self):
45+
return (
4646
US_REG_CROSSREFERENCE_LOOKUP_PATH
4747
if self.regulations
4848
else US_CROSSREFERENCE_LOOKUP_PATH
49-
)
49+
) + ("/detailed" if self.detailed_crossreferences else "")
5050

51+
def execute_item(self, item):
5152
yearfiles = [
5253
os.path.join(US_REFERENCE_PARSED_PATH, x)
5354
for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
@@ -65,8 +66,12 @@ def execute_item(self, item):
6566
file_elem = lxml.etree.parse(f)
6667
for node in file_elem.xpath("//*[@citekey]"):
6768
data.append([node.attrib["key"], node.attrib["citekey"]])
69+
if self.detailed_crossreferences:
70+
for node in file_elem.xpath("//*[@citekey_detailed]"):
71+
for citekey in node.attrib["citekey_detailed"].split(","):
72+
data.append([node.attrib["key"], citekey])
6873
df = pd.DataFrame(data, columns=["key", "citekey"])
69-
destination_file = f"{dest}/{get_filename(item)}"
74+
destination_file = f"{self.dest}/{get_filename(item)}"
7075
df.to_csv(destination_file, index=False)
7176

7277

statutes_pipeline_steps/us_to_xml.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,7 @@ def export_to_xml(roots, version):
795795
soup.append(doc_to_soup(root, soup, 0, version, root=True))
796796
remove_unnecessary_subseqitems(soup)
797797
add_keys_to_items(soup, f'{root["itempathcomponents"][0]}_{version}')
798+
add_detailed_citekeys(soup)
798799
f.write(soup.encode("utf-8"))
799800

800801

@@ -838,3 +839,36 @@ def fix_nesting_errors(item, documents):
838839
+ doc["expcite"].split("!@!")[1:]
839840
)
840841
doc["expcite"] = "!@!".join(expcite)
842+
843+
844+
# Up to four consecutive leading enumerators such as "(a)(1)(A)(i)", each one
# or two characters long and optionally preceded by whitespace. Every group is
# optional, so the pattern matches any string (possibly with all groups None).
citekeys_detailed_pattern = re.compile(r"(?:\s*\((..?)\))?" * 4)


def _find_parent_citekey(subseqitem):
    """Return the citekey that the keys of *subseqitem* should extend.

    Walks the ancestors from the innermost outwards and stops at the first
    one that is either a ``seqitem`` (its ``citekey`` is used) or carries a
    ``citekey_detailed`` attribute (its last, i.e. most specific, entry is
    used). Returns ``None`` when no such ancestor exists or when the
    enclosing seqitem has no ``citekey``.
    """
    for parent in subseqitem.parents:
        if parent.name == "seqitem":
            return parent.attrs.get("citekey")
        if "citekey_detailed" in parent.attrs:
            return parent.attrs["citekey_detailed"].split(",")[-1]
    return None


def add_detailed_citekeys(soup):
    """Annotate sub-items with ``citekey_detailed`` attributes.

    For every ``subseqitem`` whose first child is a ``text`` element that
    starts with enumerators like ``(a)(1)``, a comma-separated list of
    citekeys is stored in ``citekey_detailed``: one key per enumerator, each
    extending the previous one with ``_<enumerator>``, starting from the
    citekey of the nearest annotated ancestor.

    Sub-items for which no parent citekey can be determined are left
    untouched instead of failing or inheriting an unrelated key.

    Args:
        soup: Parsed document tree; modified in place.

    Raises:
        AssertionError: If a generated citekey contains a comma, which would
            make the comma-separated attribute ambiguous.
    """
    for subseqitem in soup.find_all("subseqitem"):
        contents = subseqitem.contents
        if not contents or contents[0].name != "text":
            continue

        groups = citekeys_detailed_pattern.match(contents[0].text).groups()
        match_components = [component for component in groups if component]
        if not match_components:
            continue

        # Resolve the parent freshly for each item so that a value found for
        # an earlier sub-item can never leak into this one.
        parent_citekey = _find_parent_citekey(subseqitem)
        if parent_citekey is None:
            continue

        # Ignore if previous component is repeated
        if parent_citekey.split("_")[-1] == match_components[0]:
            match_components = match_components[1:]

        citekey = parent_citekey
        citekeys_detailed = []
        for match_component in match_components:
            citekey = f"{citekey}_{match_component}"
            # Explicit raise instead of ``assert`` so the guard also holds
            # when Python runs with -O; the exception type is unchanged.
            if "," in citekey:
                raise AssertionError(citekey)
            citekeys_detailed.append(citekey)

        if citekeys_detailed:
            subseqitem.attrs["citekey_detailed"] = ",".join(citekeys_detailed)

0 commit comments

Comments
 (0)