add detailed_crossreferences

beckedorf · beckedorf · commit c5462ba946e8 · 2021-06-14T16:27:45.000+02:00
diff --git a/__main__.py b/__main__.py
@@ -145,6 +145,17 @@ def get_subseqitem_conf(subseqitems):
         default=False,
         help="Include regulations",
     )
+
+    parser.add_argument(
+        "-dc",
+        "--detailed-crossreferences",
+        dest="detailed_crossreferences",
+        action="store_const",
+        const=True,
+        default=False,
+        help="Resolve cross references on the lowest possible level. "
+        "Default is to resolve on seqitem level (e.g. sections).",
+    )
     args = parser.parse_args()
 
     steps = [step.lower() for step in args.steps]
@@ -156,6 +167,7 @@ def get_subseqitem_conf(subseqitems):
     interval = args.interval
     selected_items = args.filter or []
     regulations = args.regulations
+    detailed_crossreferences = args.detailed_crossreferences
 
     if dataset not in ["de", "us"]:
         raise Exception(f"{dataset} unsupported dataset. Options: us, de")
@@ -300,11 +312,16 @@ def get_subseqitem_conf(subseqitems):
 
     if "crossreference_lookup" in steps:
         if dataset == "us":
-            step = UsCrossreferenceLookup(regulations=regulations, processes=processes)
+            step = UsCrossreferenceLookup(
+                detailed_crossreferences=detailed_crossreferences,
+                regulations=regulations,
+                processes=processes,
+            )
             items = step.get_items(overwrite, snapshots)
             step.execute_items(items)
 
         elif dataset == "de":
+            assert not detailed_crossreferences
             step = DeCrossreferenceLookup(regulations=regulations, processes=processes)
             items = step.get_items(snapshots)
             step.execute_items(items)
@@ -314,12 +331,15 @@ def get_subseqitem_conf(subseqitems):
     if "crossreference_edgelist" in steps:
         if dataset == "us":
             step = UsCrossreferenceEdgelist(
-                regulations=regulations, processes=processes
+                detailed_crossreferences=detailed_crossreferences,
+                regulations=regulations,
+                processes=processes,
             )
             items = step.get_items(overwrite, snapshots)
             step.execute_items(items)
 
         elif dataset == "de":
+            assert not detailed_crossreferences
             law_names_data = load_law_names(regulations)
             step = DeCrossreferenceEdgelist(
                 regulations=regulations,
@@ -353,14 +373,15 @@ def get_subseqitem_conf(subseqitems):
                 US_REG_CROSSREFERENCE_GRAPH_PATH
                 if regulations
                 else US_CROSSREFERENCE_GRAPH_PATH
-            )
+            ) + ("/detailed" if detailed_crossreferences else "")
             edgelist_folder = (
                 US_REG_CROSSREFERENCE_EDGELIST_PATH
                 if regulations
                 else US_CROSSREFERENCE_EDGELIST_PATH
-            )
+            ) + ("/detailed" if detailed_crossreferences else "")
             authority_edgelist_folder = US_REG_AUTHORITY_EDGELIST_PATH
         elif dataset == "de":
+            assert not detailed_crossreferences
             source = (
                 DE_REG_HIERARCHY_GRAPH_PATH if regulations else DE_HIERARCHY_GRAPH_PATH
             )
@@ -393,6 +414,7 @@ def get_subseqitem_conf(subseqitems):
         print("Make crossreference graph: done")
 
     if "snapshot_mapping_index" in steps:
+        assert not detailed_crossreferences
         if dataset == "us":
             source_text = (
                 [US_REFERENCE_PARSED_PATH, US_REG_REFERENCE_PARSED_PATH]
@@ -433,6 +455,7 @@ def get_subseqitem_conf(subseqitems):
         print("Make snapshot mapping: done")
 
     if "snapshot_mapping_edgelist" in steps:
+        assert not detailed_crossreferences
         if dataset == "us":
             source = os.path.join(
                 US_REG_SNAPSHOT_MAPPING_INDEX_PATH
diff --git a/statutes_pipeline_steps/us_crossreference_edgelist.py b/statutes_pipeline_steps/us_crossreference_edgelist.py
@@ -17,6 +17,10 @@
 
 
 class UsCrossreferenceEdgelist(RegulationsPipelineStep):
+    def __init__(self, detailed_crossreferences, *args, **kwargs):
+        self.detailed_crossreferences = detailed_crossreferences
+        super().__init__(*args, **kwargs)
+
     def get_items(self, overwrite, snapshots) -> list:
         ensure_exists(self.dest)
         if not snapshots:
@@ -38,15 +42,15 @@ def dest(self):
             US_REG_CROSSREFERENCE_EDGELIST_PATH
             if self.regulations
             else US_CROSSREFERENCE_EDGELIST_PATH
-        )
+        ) + ("/detailed" if self.detailed_crossreferences else "")
 
     @property
     def lookup(self):
         return (
             US_REG_CROSSREFERENCE_LOOKUP_PATH
             if self.regulations
             else US_CROSSREFERENCE_LOOKUP_PATH
-        )
+        ) + ("/detailed" if self.detailed_crossreferences else "")
 
     def execute_item(self, item):
         yearfiles = [
@@ -79,20 +83,29 @@ def make_edge_list(self, yearfile_path, key_dict):
             file_elem = lxml.etree.parse(f)
         edge_list = []
 
-        # for debug
-        # problem_matches = set()
-        # problem_keys = set()
-
-        for seqitem_elem in file_elem.xpath("//seqitem"):
-            node_out = seqitem_elem.attrib.get("key")
-            for ref_elem in seqitem_elem.xpath(".//reference"):
+        if self.detailed_crossreferences:
+            for ref_elem in file_elem.xpath(".//reference"):
+                node_out = ref_elem.getparent().getparent().attrib.get("key")
                 refs = json.loads(ref_elem.attrib["parsed"])
                 for ref in refs:
-                    key = "_".join(ref[:2])
-                    node_in = key_dict.get(key)
-
-                    if node_in:
-                        edge_list.append([node_out, node_in])
+                    for cutoff in range(len(ref), 1, -1):
+                        key = "_".join(ref[:cutoff])
+                        node_in = key_dict.get(key)
+                        if node_in:
+                            edge_list.append([node_out, node_in])
+                            break
+        else:
+            for seqitem_elem in file_elem.xpath("//seqitem"):
+                node_out = seqitem_elem.attrib.get("key")
+                for ref_elem in seqitem_elem.xpath(".//reference"):
+                    refs = json.loads(ref_elem.attrib["parsed"])
+                    for ref in refs:
+                        for cutoff in range(len(ref), 1, -1):
+                            key = "_".join(ref[:cutoff])
+                            node_in = key_dict.get(key)
+                            if node_in:
+                                edge_list.append([node_out, node_in])
+                                break
         return edge_list
 
 
diff --git a/statutes_pipeline_steps/us_crossreference_lookup.py b/statutes_pipeline_steps/us_crossreference_lookup.py
@@ -14,13 +14,12 @@
 
 
 class UsCrossreferenceLookup(RegulationsPipelineStep):
+    def __init__(self, detailed_crossreferences, *args, **kwargs):
+        self.detailed_crossreferences = detailed_crossreferences
+        super().__init__(*args, **kwargs)
+
     def get_items(self, overwrite, snapshots) -> list:
-        dest = (
-            US_REG_CROSSREFERENCE_LOOKUP_PATH
-            if self.regulations
-            else US_CROSSREFERENCE_LOOKUP_PATH
-        )
-        ensure_exists(dest)
+        ensure_exists(self.dest)
 
         # If snapshots not set, create list of all years
         if not snapshots:
@@ -34,20 +33,22 @@ def get_items(self, overwrite, snapshots) -> list:
             )
 
         if not overwrite:
-            existing_files = os.listdir(dest)
+            existing_files = os.listdir(self.dest)
             snapshots = list(
                 filter(lambda f: get_filename(f) not in existing_files, snapshots)
             )
 
         return snapshots
 
-    def execute_item(self, item):
-        dest = (
+    @property
+    def dest(self):
+        return (
             US_REG_CROSSREFERENCE_LOOKUP_PATH
             if self.regulations
             else US_CROSSREFERENCE_LOOKUP_PATH
-        )
+        ) + ("/detailed" if self.detailed_crossreferences else "")
 
+    def execute_item(self, item):
         yearfiles = [
             os.path.join(US_REFERENCE_PARSED_PATH, x)
             for x in list_dir(US_REFERENCE_PARSED_PATH, ".xml")
@@ -65,8 +66,12 @@ def execute_item(self, item):
                 file_elem = lxml.etree.parse(f)
             for node in file_elem.xpath("//*[@citekey]"):
                 data.append([node.attrib["key"], node.attrib["citekey"]])
+            if self.detailed_crossreferences:
+                for node in file_elem.xpath("//*[@citekey_detailed]"):
+                    for citekey in node.attrib["citekey_detailed"].split(","):
+                        data.append([node.attrib["key"], citekey])
         df = pd.DataFrame(data, columns=["key", "citekey"])
-        destination_file = f"{dest}/{get_filename(item)}"
+        destination_file = f"{self.dest}/{get_filename(item)}"
         df.to_csv(destination_file, index=False)
 
 
diff --git a/statutes_pipeline_steps/us_to_xml.py b/statutes_pipeline_steps/us_to_xml.py
@@ -795,6 +795,7 @@ def export_to_xml(roots, version):
             soup.append(doc_to_soup(root, soup, 0, version, root=True))
             remove_unnecessary_subseqitems(soup)
             add_keys_to_items(soup, f'{root["itempathcomponents"][0]}_{version}')
+            add_detailed_citekeys(soup)
             f.write(soup.encode("utf-8"))
 
 
@@ -838,3 +839,60 @@ def fix_nesting_errors(item, documents):
                     + doc["expcite"].split("!@!")[1:]
                 )
                 doc["expcite"] = "!@!".join(expcite)
+
+
+# Matches up to four leading parenthesised enumerators of one or two characters
+# each, e.g. "(a)(1)(A)(i)". Every group is optional, so match() never fails.
+citekeys_detailed_pattern = re.compile(r"(?:\s*\((..?)\))?" * 4)
+
+
+def add_detailed_citekeys(soup):
+    """Attach ``citekey_detailed`` attributes to enumerated subseqitems.
+
+    For each ``subseqitem`` whose first child is a ``text`` element that starts
+    with parenthesised enumerators, the citekey of the closest ancestor is
+    extended by one ``_``-separated component per enumerator. The closest
+    ancestor is the nearest ``seqitem`` (its ``citekey``) or a nearer element
+    that already has ``citekey_detailed`` (its last entry). All keys built this
+    way are stored comma-separated in ``citekey_detailed``.
+
+    Subseqitems without any such ancestor are skipped.
+
+    Args:
+        soup: Parsed document tree; modified in place.
+    """
+    for subseqitem in soup.find_all("subseqitem"):
+        contents = subseqitem.contents
+        if not contents or contents[0].name != "text":
+            continue
+
+        groups = citekeys_detailed_pattern.match(contents[0].text).groups()
+        match_components = [group for group in groups if group]
+        if not match_components:
+            continue
+
+        for parent in subseqitem.parents:
+            if parent.name == "seqitem":
+                parent_citekey = parent.attrs["citekey"]
+                break
+            if "citekey_detailed" in parent.attrs:
+                parent_citekey = parent.attrs["citekey_detailed"].split(",")[-1]
+                break
+        else:
+            # No ancestor provides a citekey. Skip the item instead of raising
+            # NameError or reusing the key left over from an earlier iteration.
+            continue
+
+        # Ignore the first enumerator if it repeats the parent's last component.
+        if parent_citekey.split("_")[-1] == match_components[0]:
+            match_components = match_components[1:]
+
+        citekey = parent_citekey
+        citekeys_detailed = []
+        for match_component in match_components:
+            citekey = f"{citekey}_{match_component}"
+            # "," separates the entries of the attribute, so no key may contain it.
+            assert "," not in citekey, citekey
+            citekeys_detailed.append(citekey)
+        if citekeys_detailed:
+            subseqitem.attrs["citekey_detailed"] = ",".join(citekeys_detailed)