Added CharacterHistograms processor

ssh-meister · ssh-meister · commit 14e0ce38f85d · 2025-04-10T12:54:52.000Z
Signed-off-by: Sasha Meister <sasha.meister.work@gmail.com>
diff --git a/dataset_configs/multilingual/yodas2/config.yaml b/dataset_configs/multilingual/yodas2/config.yaml
@@ -13,6 +13,9 @@ filters:
 translation:
   source_lang: English
   target_lang: Italian
+  filters:
+    max_len_diff_ratio: 4
+    max_hist_token_ratio: 0.8
 
 processors:
   - _target_: sdp.processors.datasets.yodas2.ListYodas2Data
@@ -232,19 +235,74 @@ processors:
       max_length: 512
       tokenize: False
       add_generation_prompt: True
+    
+  - _target_: sdp.processors.CountNumWords  # store the word count of `pred_text` in `num_words_src`
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_28.json
+    text_key: pred_text
+    num_words_key: num_words_src
+  
+  - _target_: sdp.processors.PreserveByValue  # presumably keeps entries with num_words_src > 1; verify against PreserveByValue
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_29.json
+    input_value_key: num_words_src
+    operator: gt
+    target_value: 1
+  
+  - _target_: sdp.processors.CountNumWords  # store the word count of `generation` in `num_words_tgt`
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_30.json
+    text_key: generation
+    num_words_key: num_words_tgt
+  
+  - _target_: sdp.processors.PreserveByValue  # presumably keeps entries with num_words_tgt > 1; verify against PreserveByValue
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_31.json
+    input_value_key: num_words_tgt
+    operator: gt
+    target_value: 1
+  
+  - _target_: sdp.processors.LambdaExpression  # presumably evaluates `expression` per entry and stores it in `new_field`
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_32.json
+    new_field: 'len_diff_ratio'  # NOTE(review): the expression below is the absolute word-count difference, not a ratio - confirm intent
+    expression: max(entry.num_words_src - entry.num_words_tgt, entry.num_words_tgt - entry.num_words_src)
+  
+  - _target_: sdp.processors.PreserveByValue  # presumably keeps entries whose len_diff_ratio is below the configured limit
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_33.json
+    input_value_key: len_diff_ratio
+    operator: lt
+    target_value: ${translation.filters.max_len_diff_ratio}
+  
+  # Character-histogram check of the source text. CharacterHistograms writes
+  # 1 when the share of in-histogram characters exceeds its threshold and 0
+  # otherwise, so the entries to keep are the ones with the higher score.
+  - _target_: sdp.processors.CharacterHistograms
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_34.json
+    text_field: pred_text
+    lang: ${filters.source_lang}
+    output_score_field: hist_token_ratio_pred_text
+    # An empty string is not a usable directory; share one cache between both
+    # histogram steps so the archive is fetched once.
+    cache_dir: ${workspace_dir}/histograms_cache
+
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_35.json
+    input_value_key: hist_token_ratio_pred_text
+    # `gt`, not `lt`: `lt` would keep exactly the entries that failed the check.
+    # NOTE(review): despite its name, max_hist_token_ratio acts as a lower bound here.
+    operator: gt
+    target_value: ${translation.filters.max_hist_token_ratio}
+
+  # Same check for the translated text. Manifest numbers continue from the
+  # previous step instead of reusing manifest_34/manifest_35.
+  - _target_: sdp.processors.CharacterHistograms
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_36.json
+    text_field: generation
+    lang: it #${filters.source_lang}
+    output_score_field: hist_token_ratio_generation
+    cache_dir: ${workspace_dir}/histograms_cache
+
+  - _target_: sdp.processors.PreserveByValue
+    output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_37.json
+    input_value_key: hist_token_ratio_generation
+    operator: gt
+    target_value: ${translation.filters.max_hist_token_ratio}
 
   - _target_: sdp.processors.CometoidWMTQualityEstimation
     output_manifest_file: ${workspace_dir}/${filters.source_lang}/manifest_28.json  # NOTE(review): same file name as the first CountNumWords step earlier in this list - confirm the overwrite is intended
     source_text_field: pred_text #source
     target_text_field: generation #target
     model_name_or_path: cometoid-wmt23
-    device_type: gou
+    device_type: gpu
     num_devices: 4
     chunksize: 10
-
-
-
-
   
   
 
diff --git a/sdp/processors/metrics/text.py b/sdp/processors/metrics/text.py
@@ -13,6 +13,13 @@
 # limitations under the License.
 
 import re
+import os
+import tempfile
+import shutil
+import requests
+import wget
+import tarfile
+from tqdm import tqdm
 
 from sdp.logging import logger
 from sdp.processors.base_processor import BaseParallelProcessor, DataEntry
@@ -52,4 +59,93 @@ def process_dataset_entry(self, data_entry):
         words = cleaned_string.split()
         num_words = len(words)
         data_entry[self.num_words_key] = num_words
+        return [DataEntry(data=data_entry)]
+
+
+class CharacterHistograms(BaseParallelProcessor):
+    def __init__(self,
+                 text_field: str,
+                 lang_field: str = None,
+                 lang: str = None,
+                 threshold: float = 0.8,
+                 cache_dir: str = None,
+                 threshold_char: str = "]",
+                 output_score_field: str = "hist_token_ratio",
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.text_field = text_field
+        
+        if lang_field is None and lang is None: 
+            raise ValueError("One of the arguments `lang` or `lang_field` must be provided.")
+                
+        if lang_field is not None and lang is not None: 
+            raise ValueError(
+                f"Both `lang` ({lang}) and `lang_field` ({lang_field}) are provided, which makes the source of language ambiguous. Please provide only one of them."
+            )
+        
+        self.text_field = text_field
+        self.lang_field = lang_field
+        self.lang = lang
+        self.threshold = threshold
+        self.cache_dir = cache_dir
+        self.threshold_char = threshold_char
+        self.output_score_field = output_score_field
+        self.histograms = dict()
+
+    def _read_hist(self, lang: str):
+        hist_file = os.path.join(self.cache_dir, lang)
+        chars = []
+        with open(hist_file) as hist:
+            for line in hist:
+                char = line[0] 
+                chars.append(char)
+                if char == self.threshold_char:
+                    break
+        self.histograms[lang] =set(chars)
+    
+    def prepare(self):
+        if self.cache_dir is None:
+            self.cache_dir = tempfile.mkdtemp()
+
+        os.makedirs(self.cache_dir, exist_ok=True)
+        
+        if not os.path.exists(self.cache_dir):
+            logger.info(f'Downloading histograms to {self.cache_dir}')
+            histograms_url = 'https://dl.fbaipublicfiles.com/m2m_100/histograms.tar.gz'
+            response = requests.get(histograms_url)
+
+            if response.status_code != 200:
+                raise requests.exceptions.RequestException(
+                f"Failed to download histogram file. Status code: {response.status_code}"
+            )
+            
+            histograms_tarfile = wget.download(histograms_url, out=self.cache_dir)
+            with tarfile.open(histograms_tarfile, "r:gz") as tar:
+                tar.extractall(path=self.cache_dir)
+            
+            self.cache_dir = os.path.join(self.cache_dir, "checkpoint/edunov/cc60_multilingual/clean_hists")
+            logger.info(f'Histograms are downloaded.')
+
+        logger.info(f'Reading histograms')
+        available_langs = os.listdir(self.cache_dir)
+        if self.lang is not None:
+            if self.lang in available_langs:
+                self._read_hist(self.lang)
+            else:
+                raise ValueError(f"Invalid value for `lang`: {self.lang}. Please provide one of the following: {available_langs}")
+            logger.info(f'Histogram for `{self.lang}` has been read.')
+        else:
+            for lang in tqdm(available_langs):
+                self._read_hist(lang)
+            logger.info(f'Histograms have been read.')
+        
+    def process_dataset_entry(self, data_entry):
+        lang = self.lang if self.lang is not None else data_entry[self.lang_field]
+        if lang not in self.histograms:
+            raise ValueError(f'lang `{lang} is not supported.')
+
+        text = data_entry[self.text_field].strip()
+        cnt = len([c for c in text if c in self.histograms[lang]])
+        token_ratio = 1 if cnt / len(text) > self._threshold else 0
+        data_entry[self.output_score_field] = token_ratio
         return [DataEntry(data=data_entry)]