Skip to content

Commit 1e5b9af

Browse files
authored
Merge pull request #329 from dmnc-grdnr/feature/328-remove-annotations
#328 - Remove annotations in a specified range
2 parents 991e7a1 + 7ae151f commit 1e5b9af

File tree

3 files changed

+435
-2
lines changed

3 files changed

+435
-2
lines changed

cassis/cas.py

Lines changed: 105 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
FEATURE_BASE_NAME_HEAD,
1616
FEATURE_BASE_NAME_LANGUAGE,
1717
TYPE_NAME_DOCUMENT_ANNOTATION,
18+
TYPE_NAME_ANNOTATION,
1819
TYPE_NAME_FS_ARRAY,
1920
TYPE_NAME_FS_LIST,
2021
TYPE_NAME_SOFA,
@@ -367,6 +368,78 @@ def add_annotations(self, annotations: Iterable[FeatureStructure]):
367368
"""
368369
self.add_all(annotations)
369370

371+
def crop_sofa_string(self, sofa_begin: int, sofa_end: int, overlap: bool = True):
    """Replace the current view's sofa string with the slice ``[sofa_begin:sofa_end]``.

    Annotations that lie completely inside the cut are kept and shifted into the
    coordinate space of the new sofa string. Annotations that only partially
    intersect the cut are clipped to the cut boundaries when ``overlap`` is true
    and removed otherwise. Annotations that lie completely outside the cut are
    always removed. An annotation that merely touches a cut boundary from the
    outside (``end == sofa_begin`` or ``begin == sofa_end``) counts as outside.

    Feature structures that are not instances of the annotation type have no
    ``begin``/``end`` offsets; they are not anchored to the text and are left in
    the index untouched.

    Args:
        sofa_begin: Offset in the current sofa string at which the cut starts (inclusive).
        sofa_end: Offset in the current sofa string at which the cut ends (exclusive).
        overlap: If true, annotations crossing a cut boundary are kept and their
            ``begin``/``end`` are clipped to the cut; if false, they are removed.

    Raises:
        ValueError: If the current view has no sofa string, or unless
            ``0 <= sofa_begin < sofa_end <= len(sofa_string)``.

    Note:
        Removal only takes annotations out of the current view's index. Removed
        feature structures stay in memory, and references to them from kept
        annotations are left intact, so they can still be discovered by traversal
        (e.g. ``_find_all_fs()``) and included during serialization.

        Only kept annotations get their offsets adjusted. Removed feature
        structures keep their original ``begin``/``end`` values, so serializers may
        attempt to transcode offsets that fall outside the new sofa range; the
        offset converter emits ``UserWarning`` for unmappable offsets but does not
        raise. If a cascading delete or re-anchoring of transitively referenced
        feature structures is required, perform an explicit graph traversal and
        removal.
    """
    if self.sofa_string is None:
        raise ValueError("Cannot crop sofa string: CAS has no sofa string for the current view")

    if not 0 <= sofa_begin < sofa_end <= len(self.sofa_string):
        raise ValueError(f"Invalid indices for begin {sofa_begin} and end {sofa_end}")

    # Snapshot the annotation-like feature structures up front: the index is
    # modified while we walk it, and arbitrary feature structures (e.g. instances
    # of uima.cas.TOP) have no begin/end and would raise AttributeError below.
    annotations = [
        fs for fs in self.select_all() if self.typesystem.is_instance_of(fs.type, TYPE_NAME_ANNOTATION)
    ]

    self.sofa_string = self.sofa_string[sofa_begin:sofa_end]

    for annotation in annotations:
        contained = sofa_begin <= annotation.begin and annotation.end <= sofa_end
        intersects = annotation.begin < sofa_end and sofa_begin < annotation.end

        if not (contained or (overlap and intersects)):
            # Outside the cut, or crossing a boundary while overlaps are not wanted.
            self.remove(annotation)
            continue

        # The index is ordered by begin/end, so the annotation has to be taken out
        # before its offsets change and re-added afterwards to keep the order valid.
        self._current_view.remove_annotation_from_index(annotation)
        # Clip to the cut, then shift into the new coordinate space. For a fully
        # contained annotation the clipping is a no-op and only the shift applies.
        annotation.begin = max(annotation.begin, sofa_begin) - sofa_begin
        annotation.end = min(annotation.end, sofa_end) - sofa_begin
        self._current_view.add_annotation_to_index(annotation)
442+
370443
def remove(self, annotation: FeatureStructure):
371444
"""Removes an annotation from an index. This throws if the
372445
annotation was not present.
@@ -386,6 +459,38 @@ def remove_annotation(self, annotation: FeatureStructure):
386459
"""
387460
self.remove(annotation)
388461

462+
def remove_annotations_in_range(self, begin: int, end: int, type_: Optional[Union[Type, str]] = None):
    """Remove the annotations that lie entirely within ``[begin, end]`` of the sofa string.

    An annotation is removed only if ``begin <= annotation.begin < annotation.end <= end``.
    Annotations that merely overlap the range are kept, and so are zero-width
    annotations (``annotation.begin == annotation.end``).

    Args:
        begin: Start offset of the range.
        end: End offset of the range.
        type_: The type, or the name of the type, whose instances are candidates
            for removal. If omitted, every feature structure in the view that is
            an instance of the annotation type is a candidate.

    Raises:
        ValueError: If the current view has no sofa string, or unless
            ``0 <= begin < end <= len(sofa_string)``.
    """
    if type_ is not None:
        candidates = self.select(type_)
    else:
        # Without an explicit type, restrict the candidates to annotation-like
        # feature structures: arbitrary ones (e.g. instances of uima.cas.TOP)
        # carry no begin/end and would raise AttributeError in the range check.
        candidates = list(
            filter(
                lambda fs: self.typesystem.is_instance_of(fs.type, TYPE_NAME_ANNOTATION),
                self.select_all(),
            )
        )

    text = self.sofa_string
    if text is None:
        raise ValueError("Cannot remove annotations by range: CAS has no sofa string for the current view")

    if not (0 <= begin < end <= len(text)):
        raise ValueError(f"Invalid indices for begin {begin} and end {end}")

    # Walk a frozen copy: removing from the index while iterating the live
    # selection could otherwise skip or repeat elements.
    for fs in tuple(candidates):
        if begin <= fs.begin < fs.end <= end:
            self.remove(fs)
493+
389494
@deprecation.deprecated(details="Use annotation.get_covered_text()")
390495
def get_covered_text(self, annotation: FeatureStructure) -> str:
391496
"""Gets the text that is covered by `annotation`.

0 commit comments

Comments
 (0)