@@ -405,36 +405,67 @@ def finalize(self, metrics):
405405
406406
407407class SubRegex (BaseParallelProcessor ):
408- """Converts a regex match to a string, as defined by key-value pairs in ``regex_to_sub``.
408+ """
409+ Applies a sequence of regex substitutions to the specified text field in each data entry.
410+
411+ This processor performs regex-based substitutions as defined in either a provided list of
412+ regex parameter dictionaries or a YAML configuration file. Each substitution is applied in
413+ the order specified.
409414
410- Before applying regex changes, we will add a space
411- character to the beginning and end of the ``text`` and ``pred_text``
412- keys for each data entry. After the the regex changes,
413- the extra spaces are removed. This includes the spaces in the beginning
414- and end of the text, as well as any double spaces ``" "``.
415+ Before substitutions are applied, a space is temporarily added to the beginning and end of the text
416+ to improve regex match consistency. After all substitutions, leading/trailing spaces and repeated
417+ spaces are removed.
415418
416419 Args:
417- regex_params_list (list[dict]): list of dicts.
418- Each dict must contain a ``pattern`` and a ``repl`` key,
419- and optionally a ``count`` key (by default, ``count`` will be 0).
420- This processor will go through the list in order, and apply a ``re.sub`` operation on
421- the input text in ``data_entry[self.text_key]``, feeding in the specified ``pattern``, ``repl``
422- and ``count`` parameters to ``re.sub``.
423- text_key (str): a string indicating which key of the data entries
424- should be used to find the utterance transcript. Defaults to "text".
420+ regex_params_list (List[Dict], optional): A list of dictionaries specifying the regex substitutions.
421+ Each dictionary must include::
422+
423+ - "pattern": A regex pattern to match.
424+ - "repl": A replacement string.
425+ - "count" (optional): Maximum number of replacements to make. Defaults to 0 (replace all).
426+
427+ regex_params_yaml (str, optional): Path to a YAML file that defines the same list of dictionaries
428+ as `regex_params_list`. Either `regex_params_list` or `regex_params_yaml` must be provided.
429+ If both are provided, `regex_params_yaml` takes precedence.
430+
431+ text_key (str): The key in each data entry whose value will be modified. Defaults to "text".
432+
433+ **kwargs: Additional arguments passed to the BaseParallelProcessor.
434+
435+ Example YAML format for `regex_params_yaml`:
436+ ```
437+ # regex_params.yaml
438+ - {"pattern": "♩", "repl": " "}
439+ - {"pattern": "♭", "repl": " "}
440+ - {"pattern": "\\|", "repl": " "}
441+ - {"pattern": ":", "repl": " "}
442+ - {"pattern": "-", "repl": " "}
443+ - {"pattern": "[^ €₽₴$£%?!',.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя]", "repl": ""}
444+ - {"pattern": "\\s+\\.", "repl": "."}
445+ - {"pattern": "\\?+", "repl": "?"}
446+ - {"pattern": "\\.+", "repl": "."}
447+ ```
425448
426449 Returns:
427- The same data as in the input manifest with ``<text_key>`` field changed.
450+ The same data as in the input manifest with ``<text_key>`` field changed.
428451 """
429452
430453 def __init__ (
431454 self ,
432- regex_params_list : List [Dict ],
455+ regex_params_list : List [Dict ] = None ,
456+ regex_params_yaml : str = None ,
433457 text_key : str = "text" ,
434458 ** kwargs ,
435459 ):
436460 super ().__init__ (** kwargs )
461+ if not regex_params_list and not regex_params_yaml :
462+ raise ValueError (f'One of `regex_params_list` or `regex_params_yaml` should be provided.' )
463+
437464 self .regex_params_list = regex_params_list
465+ if regex_params_yaml :
466+ with open (regex_params_yaml , 'r' ) as regex_params_file :
467+ self .regex_params_list = yaml .safe_load (regex_params_file )
468+
438469 self .text_key = text_key
439470
440471 # verify all dicts in regex_params_list have "pattern" and "repl" keys
0 commit comments