Skip to content

Commit 282ad27

Browse files
committed
Make steps optional and add debug
1 parent e248c0d commit 282ad27

File tree

2 files changed

+14
-15
lines changed

2 files changed

+14
-15
lines changed

entity_graph/graph_extractor/entities_graph_extractor.py

Lines changed: 8 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -96,7 +96,7 @@ def _add_table(self, json_data, file_name, table_name, data_type):
9696

9797
def _add_identifiers(self):
9898
# Create new identifiers
99-
for name, table_name, cols, fill_values in self.extraction_plan["identifiers"]:
99+
for name, table_name, cols, fill_values in self.extraction_plan.get("identifiers", []):
100100
new_id(
101101
self.entities_graph_manager,
102102
name,
@@ -107,9 +107,7 @@ def _add_identifiers(self):
107107
)
108108

109109
# Link identifiers
110-
for name, table_name, cols, fill_values in self.extraction_plan[
111-
"identifiers_links"
112-
]:
110+
for name, table_name, cols, fill_values in self.extraction_plan.get("identifiers_links", []):
113111
link_id(
114112
self.entities_graph_manager,
115113
name,
@@ -120,9 +118,7 @@ def _add_identifiers(self):
120118
)
121119

122120
def _make_instances(self):
123-
for id_name, table_name, do_hierarchy, override_cols in self.extraction_plan[
124-
"instances_creation"
125-
]:
121+
for id_name, table_name, do_hierarchy, override_cols in self.extraction_plan.get("instances_creation", []):
126122
# Make sure to pass the collection parameter
127123
create_instances(
128124
self.entities_graph_manager,
@@ -138,7 +134,7 @@ def _enrichment_matching(self):
138134
Perform enrichment matching based on the extraction plan.
139135
Uses self.collection for collection parameter.
140136
"""
141-
for id_name, attr_name, key in self.extraction_plan["enrichment_links"]:
137+
for id_name, attr_name, key in self.extraction_plan.get("enrichment_links", []):
142138
# Pass the collection parameter
143139
add_enrichment_links(
144140
self.entities_graph_manager,
@@ -148,9 +144,7 @@ def _enrichment_matching(self):
148144
collection=self.collection,
149145
)
150146

151-
for table_name, label1, label2, new_labels in self.extraction_plan[
152-
"enrichments"
153-
]:
147+
for table_name, label1, label2, new_labels in self.extraction_plan.get("enrichments", []):
154148
# Pass the collection parameter
155149
enrich_from_table_name(
156150
self.entities_graph_manager,
@@ -166,7 +160,7 @@ def _enrichment_models(self):
166160
Perform enrichment with models based on the extraction plan.
167161
Uses self.collection for collection parameter.
168162
"""
169-
for id_name in self.extraction_plan["identifiers_to_enrich"]:
163+
for id_name in self.extraction_plan.get("identifiers_to_enrich", []):
170164
# Pass the collection parameter
171165
enrich_identifier_values(
172166
self.entities_graph_manager, id_name, collection=self.collection
@@ -239,13 +233,12 @@ def _join_identifiers(self):
239233
print(
240234
f"Name map has {len(self.entities_graph_manager.get_name_map(collection=self.collection))} entries in collection '{self.collection}'"
241235
)
242-
243-
for linking_column in self.extraction_plan["linking_columns"]:
236+
join_count = 0
237+
for linking_column in self.extraction_plan.get("linking_columns", []):
244238
print(
245239
f"Processing linking column: {linking_column} in collection '{self.collection}'"
246240
)
247241

248-
join_count = 0
249242
# Filter entities by collection
250243
entities = [
251244
e

entity_graph/graph_extractor/processing/extraction_table.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def split_column(df, column_name, sep):
5858

5959
def get_all_matching_tables(
6060
config: TableFromHeaderExtracionConfig,
61+
debug: bool = False,
6162
):
6263
"""
6364
Extracts all tables from a PDF file that match a given condition.
@@ -90,6 +91,11 @@ def get_all_matching_tables(
9091
for table_number, table in enumerate(tables, start=1):
9192
if table_test(table):
9293
oktables.append(table)
94+
elif debug:
95+
print("not ok", table[0])
96+
97+
if debug:
98+
return oktables
9399

94100
all_elements = pd.concat(
95101
[pd.DataFrame(okt[1:], columns=okt[0]) for okt in oktables], ignore_index=True

0 commit comments

Comments
 (0)