Skip to content

Commit 8b5d570

Browse files
authored
Merge pull request #11 from TheRoadQaQ/main
reasoning_process
2 parents b56c39a + 1770d4b commit 8b5d570

File tree

10 files changed

+403
-2
lines changed

10 files changed

+403
-2
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
model_cache_path: '../ckpt'        # Path to cache models
dependencies: [text]
save_path: "./processed.jsonl"

data:
  text:
    use_hf: False                  # Whether to use huggingface_dataset; if used, ignore the local data path below
    dataset_name: 'yahma/alpaca-cleaned'
    dataset_split: 'train'
    name: 'default'
    revision: null
    data_path: 'demos/reasoning_process/math_5_samples.json'  # Local data path, supports json, jsonl, parquet formats
    formatter: "TextFormatter"     # Data loader type
    keys: 'answer'                 # Key name to be processed; for sft data it can be specified as ['instruction','input','output']

processors:
  AnswerFormatterFilter:
    type: "default"
  AnswerNgramFilter:
    min_score: 0.5
    max_score: 1.0
    ngrams: 5
  AnswerGroundTruthFilter:
    compare_method: exact          # exact/math_verify/xverify
  AnswerTokenLengthFilter:
    max_answer_token_length: 1024
    tokenizer_dir: '../Qwen2.5-0.5B-Instruct'
28+
29+
30+

dataflow/process/text/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .filters import *
22
from .refiners import *
3-
from .deduplicators import *
3+
from .deduplicators import *
4+
from .reasoning import *
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import sys

from dataflow.utils.registry import LazyLoader

# Map each public processor name to (module file path, attribute name) so that
# LazyLoader can defer the actual import until the name is first accessed.
_import_structure = {
    "AnswerGroundTruthFilter": ("dataflow/process/text/reasoning/answer_ground_truth_filter.py", "AnswerGroundTruthFilter"),
    "AnswerFormatterFilter": ("dataflow/process/text/reasoning/answer_formatter_filter.py", "AnswerFormatterFilter"),
    "AnswerNgramFilter": ("dataflow/process/text/reasoning/answer_ngram_filter.py", "AnswerNgramFilter"),
    "AnswerTokenLengthFilter": ("dataflow/process/text/reasoning/answer_token_length_filter.py", "AnswerTokenLengthFilter"),
}

# Replace this package's module object with a LazyLoader so attribute access
# (e.g. reasoning.AnswerNgramFilter) triggers the deferred import declared above.
sys.modules[__name__] = LazyLoader(__name__, "dataflow/process/text/reasoning", _import_structure)
12+
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from dataflow.core import TextFilter
2+
import numpy as np
3+
from dataflow.utils.registry import PROCESSOR_REGISTRY
4+
import re
5+
6+
@PROCESSOR_REGISTRY.register()
class AnswerFormatterFilter(TextFilter):
    """Keep only samples whose 'answer' field follows the expected solution format.

    A valid answer must start with "Solution:" and contain a final result
    wrapped in \\boxed{...}.
    """

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.filter_name = 'AnswerFormatterFilter'

    @staticmethod
    def is_valid_answer(answer: str) -> bool:
        """Return True if *answer* matches the required solution format.

        Declared as a @staticmethod: it takes no instance state, and without the
        decorator an ``self.is_valid_answer(...)`` call would have broken.
        """
        # Must start with the "Solution:" marker.
        if not answer.startswith("Solution:"):
            return False
        # A per-step check (each step starting with "→") existed here but was
        # intentionally disabled; re-add it if step formatting becomes mandatory.
        # The final answer must appear inside \boxed{...}.
        if not re.search(r'\\boxed{.*}', answer):
            return False
        return True

    def filter_func(self, dataset):
        """Return a 0/1 int array marking which samples pass the format check.

        Each element of *dataset* is expected to be a mapping with an 'answer'
        key (see the pipeline config's ``keys`` setting).
        """
        indexes = np.zeros(len(dataset)).astype(int)
        for i, item in enumerate(dataset):
            if AnswerFormatterFilter.is_valid_answer(item['answer']):
                indexes[i] = 1
        return indexes
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
from dataflow.core import TextFilter
2+
import numpy as np
3+
from dataflow.utils.registry import PROCESSOR_REGISTRY
4+
#from math_verify import parse, verify, LatexExtractionConfig
5+
import pandas as pd
6+
from tqdm import tqdm
7+
import logging
8+
import re
9+
from word2number import w2n
10+
11+
# Helper class for string processing
class StringProcessor:
    """
    Encapsulates string-normalization helpers for mathematical expressions.
    """

    @staticmethod
    def _fix_fracs(string):
        """
        Rewrite shorthand fractions so they are properly braced: \\frac12 -> \\frac{1}{2}.
        Already-braced fractions (\\frac{a}{b}) are left untouched.
        """
        substrs = string.split("\\frac")
        new_str = substrs[0]
        if len(substrs) > 1:
            for substr in substrs[1:]:
                new_str += "\\frac"
                if len(substr) > 0 and substr[0] == "{":
                    # Numerator already braced: keep the tail verbatim.
                    new_str += substr
                else:
                    if len(substr) >= 2:
                        a, b = substr[0], substr[1]
                        if b != "{":
                            new_str += f"{{{a}}}{{{b}}}{substr[2:]}" if len(substr) > 2 else f"{{{a}}}{{{b}}}"
                        else:
                            new_str += f"{{{a}}}{b}{substr[2:]}" if len(substr) > 2 else f"{{{a}}}{b}"
                    else:
                        # Malformed tail (fewer than two characters): give up, return input unchanged.
                        return string
        return new_str

    @staticmethod
    def _fix_a_slash_b(string):
        """
        Convert a plain integer division "a/b" into \\frac{a}{b}.
        Anything that is not exactly a simple a/b fraction is returned unchanged.
        """
        if len(string.split("/")) != 2:
            return string
        a, b = string.split("/")
        try:
            a, b = int(a) if "sqrt" not in a else a, int(b) if "sqrt" not in b else b
            assert string == f"{a}/{b}"
            return f"\\frac{{{a}}}{{{b}}}"
        # int() raises ValueError on non-integers; the assert guards round-trip
        # fidelity. The original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt, which was never intended.
        except (ValueError, AssertionError):
            return string

    @staticmethod
    def _fix_sqrt(string):
        """
        Ensure square roots are properly braced: \\sqrt2 -> \\sqrt{2}.
        """
        return re.sub(r"\\sqrt(\w+)", r"\\sqrt{\1}", string)

    @staticmethod
    def convert_word_number(text: str) -> str:
        """
        Convert an English number word (e.g. "seven") to its digit string.
        Non-number text is passed through unchanged (best-effort conversion).
        """
        try:
            return str(w2n.word_to_num(text))
        # w2n raises ValueError on unrecognized input; catch Exception (not a
        # bare `except:`) to stay permissive without masking interpreter exits.
        except Exception:
            return text
71+
72+
73+
# Unit-text manager
class UnitTextManager:
    """
    Manages a list of unit terms to be stripped from answer strings.
    The vocabulary targets noisy word-problem text — presumably scraped math
    datasets (NOTE(review): confirm intended corpus; includes OCR artifacts
    like "m sqaure" and "v â € ™" on purpose).
    """

    def __init__(self):
        """
        Initialize the unit texts and extend the list with naive plural forms.
        """
        self.unit_texts = [
            "east", "degree", "mph", "kmph", "ft", "m sqaure", "m east", "sq m", "deg", "mile", "q .", "monkey", "prime",
            "ratio", "profit of rs", "rd", "o", "gm", "p . m", "lb", "tile", "per", "dm", "lt", "gain", "ab", "way", "west",
            "a .", "b .", "c .", "d .", "e .", "f .", "g .", "h .", "t", "a", "h", "no change", "men", "soldier", "pie", "bc",
            "excess", "st", "inches", "noon", "percent", "by", "gal", "kmh", "c", "acre", "rise", "a . m", "th", "π r 2", "sq",
            "mark", "l", "toy", "coin", "sq . m", "gallon", "° f", "profit", "minw", "yr", "women", "feet", "am", "pm", "hr",
            "cu cm", "square", "v â € ™", "are", "rupee", "rounds", "cubic", "cc", "mtr", "s", "ohm", "number", "kmph", "day",
            "hour", "minute", "min", "second", "man", "woman", "sec", "cube", "mt", "sq inch", "mp", "∏ cm ³", "hectare",
            "more", "sec", "unit", "cu . m", "cm 2", "rs .", "rs", "kg", "g", "month", "km", "m", "cm", "mm", "apple", "liter",
            "loss", "yard", "pure", "year", "increase", "decrease", "d", "less", "Surface", "litre", "pi sq m", "s .", "metre",
            "meter", "inch",
        ]
        self.unit_texts.extend([t + "s" for t in self.unit_texts])

    def clean_units(self, string: str):
        """
        Remove every unit term that appears as a standalone token in *string*.

        Each unit is matched only when bounded by non-word characters (or the
        string edges), and the bounding characters are preserved via \\1\\2.
        """
        for unit_text in self.unit_texts:
            # re.escape is required: several units contain regex metacharacters
            # (e.g. "rs .", "q ."); without it, "." matched ANY character.
            string = re.sub(r"(^|\W)" + re.escape(unit_text) + r"($|\W)", r"\1\2", string)
        return string
104+
105+
106+
# Main string-processing class
class StringCleaner:
    """
    Cleans and normalizes strings containing mathematical expressions.
    """

    def __init__(self, unit_manager: UnitTextManager):
        """
        Store the unit manager used to strip unit terms from strings.
        """
        self.unit_manager = unit_manager

    def strip_string(self, string, skip_unit=False):
        """
        Strip unwanted characters, LaTeX decorations, and (optionally) unit
        terms from *string*. Replacement order is significant and preserved.
        """
        s = str(string).strip().replace("\n", "").rstrip(".").replace("\\!", "")

        # Normalize array/bmatrix environments to pmatrix.
        s = re.sub(r"\\begin\{array\}\{.*?\}", r"\\begin{pmatrix}", s)
        s = re.sub(r"\\end\{array\}", r"\\end{pmatrix}", s).replace("bmatrix", "pmatrix")

        # Canonicalize fraction/comparison commands and drop sizing braces.
        for old, new in (
            ("tfrac", "frac"), ("dfrac", "frac"),
            ("\\neq", "\\ne"), ("\\leq", "\\le"), ("\\geq", "\\ge"),
            ("\\left", ""), ("\\right", ""),
            ("\\{", "{"), ("\\}", "}"),
        ):
            s = s.replace(old, new)

        # Optionally strip unit terms (skipped for datasets that keep units).
        if not skip_unit:
            s = self.unit_manager.clean_units(s)

        # Remove degree markers, currency signs, and math delimiters.
        for old in ("^{\\circ}", "^\\circ", "\\$", "$", "\\(", "\\)"):
            s = s.replace(old, "")

        s = StringProcessor.convert_word_number(s)
        s = re.sub(r"\\text\{(.*?)\}", r"\1", s)

        # Drop variable-assignment / set-membership prefixes.
        for key in ("x=", "y=", "z=", "x\\in", "y\\in", "z\\in", "x\\to", "y\\to", "z\\to"):
            s = s.replace(key, "")

        s = s.replace("\\emptyset", "{}").replace("(-\\infty,\\infty)", "\\mathbb{R}")

        # Strip percent signs and normalize bare leading decimal points.
        for old, new in (("%", ""), (" .", " 0."), ("{.", "{0.")):
            s = s.replace(old, new)

        return s
143+
144+
145+
# Core answer-extraction logic
class AnswerExtractor:
    """
    Extracts the final answer from a model prediction string.
    """

    def __init__(self, string_cleaner: StringCleaner):
        """
        Store the StringCleaner used to normalize extracted answers.
        """
        self.string_cleaner = string_cleaner

    def extract_answer(self, pred_str, data_name, use_last_number=True):
        """
        Extract the final answer from *pred_str*, handling several formats:
        "final answer is $...$", \\boxed{...}, "The/the answer is ...", or
        (as a fallback) the last number in the string.
        """
        # Strip the stray Cyrillic token "ки" — NOTE(review): appears to be a
        # dataset artifact; confirm against the upstream data.
        pred_str = pred_str.replace("\u043a\u0438", "")

        if "final answer is $" in pred_str and "$. I hope" in pred_str:
            pred = pred_str.split("final answer is $", 1)[1].split("$. I hope", 1)[0].strip()
        elif "boxed" in pred_str:
            pred = self._extract_boxed_answer(pred_str)
        elif "he answer is" in pred_str:
            # "he answer is" matches both "The answer is" and "the answer is".
            pred = pred_str.split("he answer is")[-1].strip()
        else:
            pred = self._get_last_number_answer(pred_str, use_last_number)

        # Some datasets keep units in the ground truth; skip unit stripping there.
        pred = self.string_cleaner.strip_string(pred, skip_unit=data_name in ["carp_en", "minerva_math"])
        return pred

    def _extract_boxed_answer(self, pred_str):
        """
        Extract the answer enclosed in 'boxed' notation (text after the last
        "boxed", up to the matching brace or the next '$').
        """
        ans = pred_str.split("boxed")[-1]
        if ans.startswith("{"):
            return self._extract_bracketed_answer(ans)
        else:
            return ans.split("$")[0].strip()

    def _extract_bracketed_answer(self, ans):
        """
        Return the contents of the brace group opened at ans[0], honoring
        nested braces via a depth counter.
        """
        stack = 1
        result = ""
        for c in ans[1:]:
            if c == "{":
                stack += 1
                result += c
            elif c == "}":
                stack -= 1
                if stack == 0:
                    break
                result += c
            else:
                result += c
        return result

    def _get_last_number_answer(self, pred_str, use_last_number):
        """
        Return the last number in the string when *use_last_number* is True,
        else "". Commas are stripped so "1,234" parses as one number.
        """
        if use_last_number:
            # Raw string: the original "-?\d*\.?\d+" contained invalid escape
            # sequences, which warn on modern Python.
            pattern = r"-?\d*\.?\d+"
            pred = re.findall(pattern, pred_str.replace(",", ""))
            return pred[-1] if pred else ""
        return ""
214+
215+
216+
@PROCESSOR_REGISTRY.register()
class AnswerGroundTruthFilter(TextFilter):
    """Keep samples whose extracted final answer equals the ground-truth answer exactly."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.filter_name = 'AnswerGroundTruthFilter'
        # Build the extraction pipeline: unit stripping -> string cleaning -> extraction.
        self.answer_extractor = AnswerExtractor(StringCleaner(UnitTextManager()))

    def filter_func(self, dataset):
        """Return a 0/1 int array; 1 where the sample has a 'ground_truth_answer'
        and the answer extracted from 'answer' matches it exactly."""
        keep = np.zeros(len(dataset)).astype(int)
        for idx, sample in enumerate(dataset):
            extracted = self.answer_extractor.extract_answer(
                sample['answer'], sample.get('data_name', None)
            )
            if 'ground_truth_answer' in sample and extracted == sample['ground_truth_answer']:
                keep[idx] = 1
        return keep
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from dataflow.core import TextFilter
2+
import numpy as np
3+
import re
4+
from dataflow.utils.registry import PROCESSOR_REGISTRY
5+
from dataflow.Eval.Text import NgramScorer
6+
7+
@PROCESSOR_REGISTRY.register()
class AnswerNgramFilter(TextFilter):
    """Filter samples by the n-gram uniqueness ratio of their 'answer' text.

    Highly repetitive answers score low (many duplicate n-grams); samples are
    kept only when the score lies within [min_score, max_score].
    """

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.filter_name = 'AnswerNgramFilter'
        self.min_score = args_dict['min_score']
        self.max_score = args_dict['max_score']
        self.ngrams = args_dict['ngrams']

    def filter_func(self, dataset):
        """Return a 0/1 int array; 1 where unique-ngrams/total-ngrams is in range."""
        n = self.ngrams
        scores = []
        for sample in dataset:
            # Lowercase and drop punctuation before tokenizing on whitespace.
            text = re.sub(r'[^\w\s]', '', sample['answer'].lower())
            tokens = text.split()
            grams = [' '.join(tokens[start:start + n]) for start in range(len(tokens) - (n - 1))]
            # Ratio of distinct n-grams to total; 0.0 when the answer is shorter than n tokens.
            scores.append(len(set(grams)) / len(grams) if grams else 0.0)
        return np.array([self.min_score <= s <= self.max_score for s in scores]).astype(int)
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from dataflow.core import TextFilter
2+
import numpy as np
3+
from dataflow.utils.registry import PROCESSOR_REGISTRY
4+
from transformers import AutoTokenizer
5+
6+
@PROCESSOR_REGISTRY.register()
class AnswerTokenLengthFilter(TextFilter):
    """Filter out samples whose 'answer' exceeds a maximum token count."""

    def __init__(self, args_dict: dict):
        super().__init__(args_dict)
        self.filter_name = 'AnswerTokenLengthFilter'
        self.max_answer_token_length = args_dict['max_answer_token_length']
        # Tokenizer used to measure answer length, loaded from the configured directory.
        self.tokenizer = AutoTokenizer.from_pretrained(args_dict['tokenizer_dir'])

    def filter_func(self, dataset):
        """Return a 0/1 int array; 1 where the tokenized answer fits the limit."""
        limit = self.max_answer_token_length

        def token_count(text):
            # Encode without special tokens so only the answer's own tokens are counted.
            return len(self.tokenizer.encode(text, add_special_tokens=False))

        return np.array([token_count(item['answer']) <= limit for item in dataset]).astype(int)

dataflow/utils/registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def get(self, name):
7474
raise e
7575
raise KeyError(f"No object named '{name}' found in '{self._name}' registry!")
7676
elif self._name == "processor":
77-
for x in ['text.refiners', 'text.filters', 'text.deduplicators', 'image.filters', 'image.deduplicators', 'video.filters']:
77+
for x in ['text.refiners', 'text.filters', 'text.deduplicators', 'text.reasoning','image.filters', 'image.deduplicators', 'video.filters']:
7878
# for x in ['image.filters', 'image.refiners']:
7979
module_path = "dataflow.process." + x
8080
try:

0 commit comments

Comments
 (0)