
Commit e168554

Added pnc restoration
Signed-off-by: Sasha Meister <sasha.meister.work@gmail.com>
1 parent 2c12847 commit e168554

File tree

2 files changed: +129 -0 lines changed
  • dataset_configs/multilingual/yodas2/prompts/pnc_restoration
  • sdp/processors/huggingface/transformers

dataset_configs/multilingual/yodas2/prompts/pnc_restoration: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
system: |
  Description:
  You have a transcript that may contain punctuation and capitalization, may not contain them, or may contain incorrect punctuation. The task is to bring the text to the correct form by restoring punctuation and capitalization, ensuring the following rules:

  Rules:
  - "Do not change, add, or remove any words in the text. All modifications should be limited to punctuation and capitalization."
  - "Restore the correct punctuation using only periods, commas, and question marks. All other symbols (including exclamation marks, colons, semicolons, quotes, parentheses, emojis, etc.) must be removed or replaced with allowed punctuation marks."
  - "If the text already contains sufficient punctuation (periods, commas, and question marks), it should remain unchanged."
  - "If punctuation is incomplete, incorrect, or contains invalid symbols (e.g., exclamation marks, ellipses, or other unnecessary symbols), it should be corrected to the proper form using only periods, commas, and question marks."
  - "Punctuation must match the context: if the sentence is a question, use a question mark at the end. In other cases, use a period or comma if needed to separate parts of the sentence."
  - "All alphanumeric characters (including digits, e.g., 3:30pm) should remain unchanged."
  - "Capitalize the first letter of each sentence."
  - "Capitalize proper nouns and abbreviations."
  - "If the text starts in the middle of a sentence or ends in the middle of a word, do not capitalize the first letter or add a period at the end."
  - "If punctuation is missing or incorrect, replace invalid symbols with valid punctuation (period, comma, or question mark) without changing the meaning of the text."

  Examples:
  - input: "the quick brown fox jumped over the lazy dog"
    output: "The quick brown fox jumped over the lazy dog."

  - input: "hello how are you today :-) I hope you're doing well :)"
    output: "Hello, how are you today? I hope you're doing well."

  - input: "She went to the store; then she bought some bread."
    output: "She went to the store, then she bought some bread."

  - input: "I can't believe this...!!! This is so exciting!!!"
    output: "I can't believe this. This is so exciting."

  - input: "Do you know where the keys are I can't find them anywhere"
    output: "Do you know where the keys are? I can't find them anywhere."

  - input: "the meeting is at 3:30pm, we should prepare by 3:00."
    output: "The meeting is at 3:30pm, we should prepare by 3:00."

  - input: "this is a great idea, but we need more details."
    output: "This is a great idea, but we need more details."

  - input: "my friend, john, is visiting new york next week."
    output: "My friend, John, is visiting New York next week."

  - input: "we need to finish the project by friday, but I am not sure about the deadline yet."
    output: "We need to finish the project by Friday, but I am not sure about the deadline yet."

  - input: "the report was almost done, but"
    output: "The report was almost done, but"

user: |
  Input transcript: {pred_text}

generation: |
  Output transcript:
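
For context, a minimal sketch of how a prompt file like this is consumed by the processor added in the second file: each top-level key (system, user, generation) becomes a chat role, and the {pred_text} placeholder is filled from a manifest entry. The local file path and the sample entry below are illustrative, not values from this commit.

import yaml

# Hypothetical local path to this prompt file; in the repository it lives under
# dataset_configs/multilingual/yodas2/prompts/.
with open("pnc_restoration.yaml", "r") as f:
    prompt = yaml.safe_load(f)

# Illustrative manifest entry holding the transcript to be restored.
data_entry = {"pred_text": "do you know where the keys are i can't find them anywhere"}

# Each top-level key becomes a chat role; its template is filled from the entry.
messages = [
    {"role": role, "content": template.format(**data_entry)}
    for role, template in prompt.items()
]
# messages now holds one dict each for the system, user, and generation roles.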

sdp/processors/huggingface/transformers: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
import yaml
import json
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer

from sdp.logging import logger
from sdp.processors.base_processor import BaseProcessor


class AutoModelForCausalLMProcessor(BaseProcessor):
    def __init__(self,
                 input_manifest_file: str,
                 output_manifest_file: str,
                 prompt_file: str,
                 output_field: str = 'generation',
                 **kwargs):
        super().__init__(
            input_manifest_file=input_manifest_file,
            output_manifest_file=output_manifest_file,
        )

        # Path to the YAML prompt template (e.g. the pnc_restoration prompt above).
        self.prompt_file = prompt_file
        self.prompt = None

        # Hugging Face model settings are expected under the `model` keyword argument;
        # resolving the config up front also validates the model name or path.
        self.cfg = kwargs['model']
        self.model_cfg = AutoConfig.from_pretrained(**self.cfg)

        self.output_field = output_field

    def read_prompt_file(self):
        with open(self.prompt_file, 'r') as prompt:
            self.prompt = yaml.safe_load(prompt)

    def build_entry_prompt(self, data_entry):
        # Turn each top-level key of the prompt YAML into a chat message,
        # filling placeholders such as {pred_text} from the manifest entry.
        entry_prompt = []
        for role in self.prompt:
            entry_prompt.append(dict(
                role=role,
                content=self.prompt[role].format(**data_entry)
            ))
        return entry_prompt

    def process(self):
        logger.info('Reading prompt:')
        self.read_prompt_file()
        logger.info(f'Prompt:\n{yaml.dump(self.prompt, default_flow_style=False)}\n')

        logger.info('Loading model:')
        # Load the pretrained weights and tokenizer named in the model config.
        model = AutoModelForCausalLM.from_pretrained(**self.cfg)
        tokenizer = AutoTokenizer.from_pretrained(self.cfg['pretrained_model_name_or_path'])

        with open(self.input_manifest_file, 'r', encoding='utf8') as fin, \
                open(self.output_manifest_file, 'w', encoding='utf8') as fout:
            for line in tqdm(fin, desc="Generation: "):
                data_entry = json.loads(line)
                entry_prompt = self.build_entry_prompt(data_entry)
                text = tokenizer.apply_chat_template(
                    entry_prompt,
                    tokenize=False,
                    add_generation_prompt=True
                )

                model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=512
                )

                # Keep only the newly generated tokens, dropping the echoed prompt.
                generated_ids = [
                    output_ids[len(input_ids):]
                    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
                ]

                # One prompt is passed per entry, so take the single decoded string.
                response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

                data_entry[self.output_field] = response
                fout.write(f'{json.dumps(data_entry)}\n')
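
A hypothetical direct invocation of this processor is sketched below; the manifest paths, prompt path, and model identifier are placeholders rather than values from this commit, and each input manifest line is assumed to be a JSON object containing the pred_text field referenced by the prompt.

# All paths and the model identifier below are illustrative placeholders.
processor = AutoModelForCausalLMProcessor(
    input_manifest_file="manifest_with_pred_text.json",
    output_manifest_file="manifest_with_pnc_restored.json",
    prompt_file="pnc_restoration.yaml",
    output_field="generation",
    model={"pretrained_model_name_or_path": "some-org/some-instruct-model"},
)
processor.process()

The generated text is written back to each manifest entry under the output_field key, so downstream processors can consume the restored transcript alongside the original fields.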

0 commit comments
