Skip to content

Commit 7de1d46

Browse files
Creating per-annotation-type parameters
1 parent b1a43ee commit 7de1d46

File tree

6 files changed

+72
-41
lines changed

6 files changed

+72
-41
lines changed

modules/accelerators/contextualization/cdf_file_annotation/extraction_pipelines/ep_file_annotation.config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,8 @@ config:
149149
operator: In
150150
targetProperty: tags
151151
limit: 500 # Number of edges to process per batch
152+
promoteFileEntities: True # Controls whether promotion is run for file entities
153+
promoteTargetEntities: True # Controls whether promotion is run for target entities
152154
deleteRejectedEdges: True
153155
deleteSuggestedEdges: True
154156
entitySearchService:

modules/accelerators/contextualization/cdf_file_annotation/functions/fn_file_annotation_finalize/services/ConfigService.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ class PromoteFunctionConfig(BaseModel, alias_generator=to_camel):
265265
get_candidates_query: QueryConfig | list[QueryConfig]
266266
delete_rejected_edges: bool
267267
delete_suggested_edges: bool
268+
promote_file_entities: bool = True
269+
promote_target_entities: bool = True
268270
entity_search_service: EntitySearchServiceConfig
269271

270272

modules/accelerators/contextualization/cdf_file_annotation/functions/fn_file_annotation_launch/services/ConfigService.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ class PromoteFunctionConfig(BaseModel, alias_generator=to_camel):
265265
get_candidates_query: QueryConfig | list[QueryConfig]
266266
delete_rejected_edges: bool
267267
delete_suggested_edges: bool
268+
promote_file_entities: bool = True
269+
promote_target_entities: bool = True
268270
entity_search_service: EntitySearchServiceConfig
269271

270272

modules/accelerators/contextualization/cdf_file_annotation/functions/fn_file_annotation_prepare/services/ConfigService.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,8 @@ class PromoteFunctionConfig(BaseModel, alias_generator=to_camel):
265265
get_candidates_query: QueryConfig | list[QueryConfig]
266266
delete_rejected_edges: bool
267267
delete_suggested_edges: bool
268+
promote_file_entities: bool = True
269+
promote_target_entities: bool = True
268270
entity_search_service: EntitySearchServiceConfig
269271

270272

modules/accelerators/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/ConfigService.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,8 @@ class PromoteFunctionConfig(BaseModel, alias_generator=to_camel):
266266
delete_rejected_edges: bool
267267
delete_suggested_edges: bool
268268
entity_search_service: EntitySearchServiceConfig
269+
promote_file_entities: bool = True
270+
promote_target_entities: bool = True
269271

270272

271273
class DataModelViews(BaseModel, alias_generator=to_camel):

modules/accelerators/contextualization/cdf_file_annotation/functions/fn_file_annotation_promote/services/PromoteService.py

Lines changed: 62 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ def __init__(
9292
# Promote flags
9393
self.delete_rejected_edges: bool = self.config.promote_function.delete_rejected_edges
9494
self.delete_suggested_edges: bool = self.config.promote_function.delete_suggested_edges
95+
self.promote_file_entities: bool = self.config.promote_function.promote_file_entities
96+
self.promote_target_entities: bool = self.config.promote_function.promote_target_entities
9597

9698
# Injected service dependencies
9799
self.entity_search_service = entity_search_service
@@ -149,11 +151,19 @@ def run(self) -> Literal["Done"] | None:
149151
grouped_candidates[key] = []
150152
grouped_candidates[key].append(edge)
151153

154+
grouped_by_type: dict[str, dict[str, list[Edge]]] = {}
155+
156+
for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items():
157+
grouped_by_type.setdefault(annotation_type, {})[text_to_find] = edges_with_same_text
158+
159+
total_grouped = sum(len(m) for m in grouped_by_type.values())
160+
152161
self.logger.info(
153-
message=f"Grouped {len(candidates)} candidates into {len(grouped_candidates)} unique text/type combinations.",
162+
message=f"Grouped {len(candidates)} candidates into {total_grouped} unique text/type combinations across {len(grouped_by_type)} types.",
154163
)
164+
155165
self.logger.debug(
156-
message=f"Deduplication savings: {len(candidates) - len(grouped_candidates)} queries avoided.",
166+
message=f"Deduplication savings: {len(candidates) - total_grouped} queries avoided.",
157167
section="END",
158168
)
159169

@@ -170,54 +180,65 @@ def run(self) -> Literal["Done"] | None:
170180

171181
try:
172182
# Process each unique text/type combination once
173-
for (text_to_find, annotation_type), edges_with_same_text in grouped_candidates.items():
183+
# Iterate per annotation type so we can check the search flag once per type
184+
for annotation_type, texts_map in grouped_by_type.items():
185+
# Determine entity space once per type
174186
entity_space: str | None = (
175187
self.file_view.instance_space
176188
if annotation_type == "diagrams.FileLink"
177189
else self.target_entities_view.instance_space
178190
)
179191

180-
# NOTE: This occurs when no instance space is set in the data model views section extraction pipelines config file
181192
if not entity_space:
182-
self.logger.warning(
183-
f"Could not determine entity space for type '{annotation_type}'.\nPlease ensure an instance space is set in the Files and Target Entities data model views section of the extraction pipeline configuration.\nSkipping."
184-
)
193+
self.logger.warning(f"Could not determine entity space for type '{annotation_type}'. Skipping all texts for this type.")
185194
continue
186195

187-
# Strategy: Check cache → query edges → fallback to global search
188-
found_nodes: list[Node] | list = self._find_entity_with_cache(
189-
text_to_find, annotation_type, entity_space
190-
)
191-
192-
# Determine result type for tracking AND deletion decision
193-
num_edges: int = len(edges_with_same_text)
194-
should_delete: bool = False
195-
196-
if len(found_nodes) == 1:
197-
batch_promoted += num_edges
198-
should_delete = False # Never delete promoted edges
199-
elif len(found_nodes) == 0:
200-
batch_rejected += num_edges
201-
should_delete = self.delete_rejected_edges
202-
else: # Multiple matches
203-
batch_ambiguous += num_edges
204-
should_delete = self.delete_suggested_edges
205-
206-
# Apply the same result to ALL edges with this text
207-
for edge in edges_with_same_text:
208-
edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes)
209-
210-
if should_delete:
211-
# Delete the edge but still update RAW row to track what happened
212-
edges_to_delete.append(EdgeId(edge.space, edge.external_id))
213-
if raw_row is not None:
214-
raw_rows_to_update.append(raw_row)
215-
else:
216-
# Update both edge and RAW row
217-
if edge_apply is not None:
218-
edges_to_update.append(edge_apply)
219-
if raw_row is not None:
220-
raw_rows_to_update.append(raw_row)
196+
if annotation_type == "diagrams.FileLink":
197+
is_searching_annotation_type = self.promote_file_entities
198+
else:
199+
is_searching_annotation_type = self.promote_target_entities
200+
201+
if not is_searching_annotation_type:
202+
self.logger.info(f"Search disabled for annotation type '{annotation_type}'. It will reject those edges without searching ({len(texts_map)} nodes).", section="START")
203+
204+
for text_to_find, edges_with_same_text in texts_map.items():
205+
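# Strategy (only when search is enabled for this type): check cache → query edges → fall back to global search; otherwise found_nodes stays empty and the edges count as rejected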
206+
found_nodes: list[Node] | list = []
207+
208+
if is_searching_annotation_type:
209+
found_nodes = self._find_entity_with_cache(
210+
text_to_find, annotation_type, entity_space
211+
)
212+
213+
# Determine result type for tracking AND deletion decision
214+
num_edges: int = len(edges_with_same_text)
215+
should_delete: bool = False
216+
217+
if len(found_nodes) == 1:
218+
batch_promoted += num_edges
219+
should_delete = False # Never delete promoted edges
220+
elif len(found_nodes) == 0:
221+
batch_rejected += num_edges
222+
should_delete = self.delete_rejected_edges
223+
else: # Multiple matches
224+
batch_ambiguous += num_edges
225+
should_delete = self.delete_suggested_edges
226+
227+
# Apply the same result to ALL edges with this text
228+
for edge in edges_with_same_text:
229+
edge_apply, raw_row = self._prepare_edge_update(edge, found_nodes)
230+
231+
if should_delete:
232+
# Delete the edge but still update RAW row to track what happened
233+
edges_to_delete.append(EdgeId(edge.space, edge.external_id))
234+
if raw_row is not None:
235+
raw_rows_to_update.append(raw_row)
236+
else:
237+
# Update both edge and RAW row
238+
if edge_apply is not None:
239+
edges_to_update.append(edge_apply)
240+
if raw_row is not None:
241+
raw_rows_to_update.append(raw_row)
221242
finally:
222243
# Update tracker with batch results
223244
self.tracker.add_edges(promoted=batch_promoted, rejected=batch_rejected, ambiguous=batch_ambiguous)

0 commit comments

Comments
 (0)