
Commit 3df8f86

Factor Query enrichment chunking into helper (#52)
1 parent 89501d1 commit 3df8f86

1 file changed

pandasaurus/query.py

Lines changed: 35 additions & 41 deletions
@@ -106,16 +106,10 @@ def minimal_slim_enrichment(self, slim_list: List[str]) -> pd.DataFrame:
         """
         source_list = [term.get_iri() for term in self._term_list]
         object_list = list(set(source_list + SlimManager.get_slim_members(slim_list)))
-        s_result = []
-        for chunk in chunks(object_list, 90):
-            s_result.extend(
-                [
-                    res
-                    for res in run_sparql_query(
-                        get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list)
-                    )
-                ]
-            )
+        s_result = self._batched_enrichment_results(
+            object_list,
+            lambda chunk: get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list),
+        )
         self.enriched_df = (
             pd.DataFrame(s_result, columns=["s", "s_label", "p", "o", "o_label"])
             .sort_values("s")
@@ -140,9 +134,10 @@ def full_slim_enrichment(self, slim_list: List[str]) -> pd.DataFrame:
         """
         source_list = [term.get_iri() for term in self._term_list]
         object_list = list(set(source_list + SlimManager.get_slim_members(slim_list)))
-        s_result = []
-        for chunk in chunks(object_list, 90):
-            s_result.extend([res for res in run_sparql_query(get_full_enrichment_query(source_list, chunk))])
+        s_result = self._batched_enrichment_results(
+            object_list,
+            lambda chunk: get_full_enrichment_query(source_list, chunk),
+        )

         self.enriched_df = (
             pd.DataFrame(s_result, columns=["s", "s_label", "p", "x", "x_label"])
@@ -172,16 +167,10 @@ def contextual_slim_enrichment(self, context: List[str]) -> pd.DataFrame:
         query_string = get_contextual_enrichment_query(context)
         source_list = [term.get_iri() for term in self._term_list]
         object_list = list(set(source_list + [res.get("term") for res in run_sparql_query(query_string)]))
-        s_result = []
-        for chunk in chunks(object_list, 90):
-            s_result.extend(
-                [
-                    res
-                    for res in run_sparql_query(
-                        get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list)
-                    )
-                ]
-            )
+        s_result = self._batched_enrichment_results(
+            object_list,
+            lambda chunk: get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list),
+        )

         self.enriched_df = (
             pd.DataFrame(s_result, columns=["s", "s_label", "p", "o", "o_label"])
@@ -219,16 +208,10 @@ def ancestor_enrichment(self, step_count: int) -> pd.DataFrame:
         source_list = [term.get_iri() for term in self._term_list]
         query_string = get_ancestor_enrichment_query(source_list, step_count)
         object_list = list(set(uri for res in run_sparql_query(query_string) for uri in res.values()))
-        s_result = []
-        for chunk in chunks(object_list, 90):
-            s_result.extend(
-                [
-                    res
-                    for res in run_sparql_query(
-                        get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list)
-                    )
-                ]
-            )
+        s_result = self._batched_enrichment_results(
+            object_list,
+            lambda chunk: get_simple_enrichment_query(source_list, chunk, self._enrichment_property_list),
+        )

         self.enriched_df = (
             pd.DataFrame(s_result, columns=["s", "s_label", "p", "o", "o_label"])
@@ -365,15 +348,13 @@ def mirror_enrichment_for_graph_generation(self, term_list: List[str]) -> None:
         # TODO definitely need a refactoring later on
         s_result = []
         for s_chunk in chunks(term_list, 45):
-            for o_chunk in chunks(term_list, 45):
-                s_result.extend(
-                    [
-                        res
-                        for res in run_sparql_query(
-                            get_simple_enrichment_query(s_chunk, o_chunk, self._enrichment_property_list)
-                        )
-                    ]
+            s_result.extend(
+                self._batched_enrichment_results(
+                    term_list,
+                    lambda o_chunk: get_simple_enrichment_query(s_chunk, o_chunk, self._enrichment_property_list),
+                    chunk_size=45,
                 )
+            )
         self.graph_df = (
             pd.DataFrame(s_result, columns=["s", "s_label", "p", "o", "o_label"])
             .sort_values("s")
@@ -385,3 +366,16 @@ def _generate_enrichment_graph(self, object_list: List[str]) -> None:
         self.mirror_enrichment_for_graph_generation(object_list)
         self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
         self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())
+
+    def _batched_enrichment_results(
+        self,
+        object_list: List[str],
+        query_builder,
+        chunk_size: int = 90,
+    ):
+        """Execute enrichment queries in batches to avoid oversized SPARQL VALUES blocks."""
+        results = []
+        for chunk in chunks(object_list, chunk_size):
+            query_string = query_builder(chunk)
+            results.extend([res for res in run_sparql_query(query_string)])
+        return results
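
For context, the following is a minimal, self-contained sketch of the batching pattern this commit factors into the helper. The chunks() and run_sparql_query() stand-ins below are assumptions made only to keep the example runnable; in pandasaurus they are imported utilities, and the real helper is a method on the Query class as shown in the diff above.

from typing import Callable, Iterable, Iterator, List


def chunks(items: List[str], size: int) -> Iterator[List[str]]:
    # Assumed behaviour of the pandasaurus utility: yield successive
    # slices of at most `size` items.
    for start in range(0, len(items), size):
        yield items[start : start + size]


def run_sparql_query(query_string: str) -> Iterable[dict]:
    # Stand-in for the real SPARQL call; it just echoes the query here.
    return [{"query": query_string}]


def batched_enrichment_results(
    object_list: List[str],
    query_builder: Callable[[List[str]], str],
    chunk_size: int = 90,
) -> List[dict]:
    # Build one query per chunk so no single VALUES block grows too large,
    # then concatenate the rows returned by every batch.
    results: List[dict] = []
    for chunk in chunks(object_list, chunk_size):
        results.extend(run_sparql_query(query_builder(chunk)))
    return results


if __name__ == "__main__":
    terms = [f"CL:{i:07d}" for i in range(200)]
    rows = batched_enrichment_results(
        terms,
        lambda chunk: f"VALUES ?o {{ {' '.join(chunk)} }}",
    )
    print(len(rows))  # 200 terms at chunk_size=90 -> 3 batches -> 3 rows

The default chunk_size of 90 mirrors the value used throughout the diff; mirror_enrichment_for_graph_generation passes 45 instead because it pairs the chunk with a second term list in the same query.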
