Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/src/sdp/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ Data modifications
:annotation:
:noindex:

.. autodata:: sdp.processors.SearchRegex
:annotation:

.. autodata:: sdp.processors.SubMakeLowercase
:annotation:

Expand Down
1 change: 1 addition & 0 deletions sdp/processors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
SubIfASRSubstitution,
SubMakeLowercase,
SubRegex,
SearchRegex,
FfmpegConvert,
)
from sdp.processors.modify_manifest.data_to_dropbool import (
Expand Down
50 changes: 49 additions & 1 deletion sdp/processors/modify_manifest/data_to_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,4 +577,52 @@ def finalize(self, metrics):
total_counter_sorted = dict(sorted(total_counter.items(), key=lambda x: x[1], reverse=True))
for word, count in total_counter_sorted.items():
logger.info(f"{word} {count}")
super().finalize(metrics)

super().finalize(metrics)


class SearchRegex(BaseParallelProcessor):
    """Flags data entries whose text matches at least one regular expression.

    Each pattern is applied with ``re.search``, so a match anywhere in the
    text counts.

    Args:
        search_patterns (list[str]): List of search patterns.
        text_key (str): Key in the data entry containing the text to search.
        output_key (str): Key in the data entry to store the output value
            indicating if any pattern has been found.
        **kwargs: Passed through unchanged to the base class constructor.
    """

    def __init__(
        self,
        search_patterns: List[str],
        text_key: str = "text",
        output_key: str = "pattern_found",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.search_patterns = search_patterns
        self.text_key = text_key
        self.output_key = output_key

    def process_dataset_entry(self, data_entry) -> List:
        """Stores under ``output_key`` whether any pattern occurs in the text.

        The entry is updated in place and returned as a single ``DataEntry``
        whose metric is the same boolean, so ``finalize`` can count matches.
        """
        text_in = data_entry[self.text_key]

        # Only the overall yes/no is recorded, so stop at the first hit
        # instead of collecting a per-pattern result that nobody reads.
        pattern_found = any(re.search(pattern, text_in) for pattern in self.search_patterns)

        data_entry[self.output_key] = pattern_found

        return [DataEntry(data=data_entry, metrics=pattern_found)]

    def finalize(self, metrics):
        """Reports counts of how many data entries contained patterns."""
        # Metrics are the per-entry booleans, so their sum is the match count.
        matched = sum(metrics)

        # Report through the module logger, like the other processors in
        # this file, rather than writing straight to stdout.
        logger.info(f"Samples amount which contain patterns: {matched}")
        logger.info(f"Samples amount which don't contain patterns: {len(metrics) - matched}")

        super().finalize(metrics)

11 changes: 11 additions & 0 deletions tests/test_data_to_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
SubIfASRSubstitution,
SubMakeLowercase,
SubRegex,
SearchRegex
)

test_params_list = []
Expand Down Expand Up @@ -90,6 +91,16 @@
]
)

# SearchRegex: the comma and the final period are neither letters nor
# whitespace, so the pattern matches and the entry is flagged.
test_params_list.append(
    (
        SearchRegex,
        {"search_patterns": ["[^a-zA-Z\\s]+"]},
        {"text": "Hola, bienvenido seas a este Canal de Ministerio Latino por Cristo."},
        {"text": "Hola, bienvenido seas a este Canal de Ministerio Latino por Cristo.", "pattern_found": True},
    )
)

@pytest.mark.parametrize("test_class,class_kwargs,test_input,expected_output", test_params_list, ids=str)
def test_data_to_data(test_class, class_kwargs, test_input, expected_output):
Expand Down