Skip to content

Commit 1de03d7

Browse files
authored
Merge pull request #310 from shijinpjlab/dev_csv
feat: support csv
2 parents 8bf0093 + 5a01ae8 commit 1de03d7

File tree

11 files changed

+1369
-8
lines changed

11 files changed

+1369
-8
lines changed

dingo/config/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
from dingo.config.input_args import (DatasetArgs, DatasetExcelArgs, DatasetFieldArgs, DatasetHFConfigArgs, DatasetS3ConfigArgs, DatasetSqlArgs, EvalPipline, EvalPiplineConfig, # noqa E402.
2-
EvaluatorLLMArgs, EvaluatorRuleArgs, ExecutorArgs, ExecutorResultSaveArgs, InputArgs)
1+
from dingo.config.input_args import (DatasetArgs, DatasetCsvArgs, DatasetExcelArgs, DatasetFieldArgs, DatasetHFConfigArgs, DatasetS3ConfigArgs, DatasetSqlArgs, EvalPipline, # noqa E402.
2+
EvalPiplineConfig, EvaluatorLLMArgs, EvaluatorRuleArgs, ExecutorArgs, ExecutorResultSaveArgs, InputArgs)

dingo/config/input_args.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,14 @@ class DatasetExcelArgs(BaseModel):
3232
has_header: bool = True # 第一行是否为列名,False 则使用列序号作为列名
3333

3434

35+
class DatasetCsvArgs(BaseModel):
    # Configuration for reading CSV datasets.
    has_header: bool = True  # Whether the first row holds column names; if False, column_x is used as the column name
    encoding: str = 'utf-8'  # File encoding; defaults to utf-8, also supports gbk, gb2312, latin1, etc.
    dialect: str = 'excel'  # CSV dialect: excel (default), excel-tab, unix, etc.
    delimiter: str | None = None  # Field delimiter; None means use the dialect's default
    quotechar: str = '"'  # Quote character; defaults to a double quote
41+
42+
3543
class DatasetFieldArgs(BaseModel):
3644
id: str = ''
3745
prompt: str = ''
@@ -49,6 +57,7 @@ class DatasetArgs(BaseModel):
4957
s3_config: DatasetS3ConfigArgs = DatasetS3ConfigArgs()
5058
sql_config: DatasetSqlArgs = DatasetSqlArgs()
5159
excel_config: DatasetExcelArgs = DatasetExcelArgs()
60+
csv_config: DatasetCsvArgs = DatasetCsvArgs()
5261

5362

5463
class ExecutorResultSaveArgs(BaseModel):

dingo/data/converter/base.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,25 @@ def _convert(raw: Union[str, Dict]):
280280
return _convert
281281

282282

283+
@BaseConverter.register("csv")
284+
class CsvConverter(BaseConverter):
285+
"""CSV file converter."""
286+
287+
def __init__(self):
288+
super().__init__()
289+
290+
@classmethod
291+
def convertor(cls, input_args: InputArgs) -> Callable:
292+
def _convert(raw: Union[str, Dict]):
293+
j = raw
294+
if isinstance(raw, str):
295+
j = json.loads(raw)
296+
data_dict = j
297+
return Data(**data_dict)
298+
299+
return _convert
300+
301+
283302
@BaseConverter.register("listjson")
284303
class ListJsonConverter(BaseConverter):
285304
"""List json file converter."""

dingo/data/datasource/local.py

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,94 @@ def _load_excel_file_xlsx(self, path: str) -> Generator[str, None, None]:
142142
if wb:
143143
wb.close()
144144

145+
def _load_csv_file(self, path: str) -> Generator[str, None, None]:
146+
"""
147+
Load a CSV file and return its contents row by row as JSON strings.
148+
Supports streaming for large files, different encodings, and various CSV formats.
149+
150+
Args:
151+
path (str): The path to the CSV file.
152+
153+
Returns:
154+
Generator[str]: Each row as a JSON string with header keys.
155+
"""
156+
import csv
157+
158+
# 获取 CSV 配置
159+
has_header = self.input_args.dataset.csv_config.has_header
160+
encoding = self.input_args.dataset.csv_config.encoding
161+
dialect = self.input_args.dataset.csv_config.dialect
162+
delimiter = self.input_args.dataset.csv_config.delimiter
163+
quotechar = self.input_args.dataset.csv_config.quotechar
164+
165+
try:
166+
# 尝试使用指定的编码打开文件
167+
with open(path, 'r', encoding=encoding, newline='') as csvfile:
168+
# 设置 CSV reader 参数
169+
reader_kwargs = {
170+
'dialect': dialect,
171+
'quotechar': quotechar,
172+
}
173+
174+
# 如果指定了自定义分隔符,覆盖 dialect 的默认值
175+
if delimiter is not None:
176+
reader_kwargs['delimiter'] = delimiter
177+
178+
# 创建 CSV reader(流式读取)
179+
csv_reader = csv.reader(csvfile, **reader_kwargs)
180+
181+
# 处理标题行
182+
headers = None
183+
# first_row_data = None
184+
185+
try:
186+
first_row = next(csv_reader)
187+
except StopIteration:
188+
raise RuntimeError(f'CSV file "{path}" is empty')
189+
190+
if has_header:
191+
# The first row is the header
192+
headers = [str(h).strip() if h else f'column_{i}' for i, h in enumerate(first_row)]
193+
data_rows = csv_reader
194+
else:
195+
# Generate headers and treat the first row as data
196+
from itertools import chain
197+
headers = [f'column_{i}' for i in range(len(first_row))]
198+
data_rows = chain([first_row], csv_reader)
199+
200+
# Process all data rows in a single loop
201+
for row in data_rows:
202+
# Skip empty rows
203+
if not row or all(not cell.strip() for cell in row):
204+
continue
205+
206+
# Combine row data with headers into a dictionary, handling rows with fewer columns
207+
row_dict = {
208+
header: (row[i].strip() if row[i] else "") if i < len(row) else ""
209+
for i, header in enumerate(headers)
210+
}
211+
212+
# Yield the JSON string
213+
yield json.dumps(row_dict, ensure_ascii=False) + '\n'
214+
215+
except UnicodeDecodeError as e:
216+
# 编码错误提示
217+
raise RuntimeError(
218+
f'Failed to read CSV file "{path}" with encoding "{encoding}": {str(e)}. '
219+
f'Please try a different encoding (e.g., "gbk", "gb2312", "latin1", "iso-8859-1").'
220+
)
221+
except csv.Error as e:
222+
# CSV 格式错误
223+
raise RuntimeError(
224+
f'Failed to parse CSV file "{path}": {str(e)}. '
225+
f'Current dialect: "{dialect}". You may need to adjust the dialect or delimiter parameter.'
226+
)
227+
except Exception as e:
228+
raise RuntimeError(
229+
f'Failed to read CSV file "{path}": {str(e)}. '
230+
f'Please ensure the file is a valid CSV file.'
231+
)
232+
145233
def _load_excel_file_xls(self, path: str) -> Generator[str, None, None]:
146234
"""
147235
Load an .xls Excel file and return its contents row by row as JSON strings.
@@ -241,8 +329,13 @@ def _load_local_file(self) -> Generator[str, None, None]:
241329
by_line = self.input_args.dataset.format not in ["json", "listjson"]
242330

243331
for f in f_list:
332+
# Check if file is CSV
333+
if f.endswith('.csv'):
334+
if self.input_args.dataset.format != 'csv':
335+
raise RuntimeError(f'CSV file "{f}" is not supported. Please set dataset.format to "csv" to read CSV files.')
336+
yield from self._load_csv_file(f)
244337
# Check if file is Excel
245-
if f.endswith('.xlsx'):
338+
elif f.endswith('.xlsx'):
246339
if self.input_args.dataset.format != 'excel':
247340
raise RuntimeError(f'Excel file "{f}" is not supported. Please set dataset.format to "excel" to read Excel files.')
248341
yield from self._load_excel_file_xlsx(f)
@@ -278,7 +371,7 @@ def _load_local_file(self) -> Generator[str, None, None]:
278371
except UnicodeDecodeError as decode_error:
279372
raise RuntimeError(
280373
f'Failed to read file "{f}": Unsupported file format or encoding. '
281-
f'Dingo only supports UTF-8 text files (.jsonl, .json, .txt), Excel files (.xlsx, .xls) and .gz compressed text files. '
374+
f'Dingo only supports UTF-8 text files (.jsonl, .json, .txt), CSV files (.csv), Excel files (.xlsx, .xls) and .gz compressed text files. '
282375
f'Original error: {str(decode_error)}'
283376
)
284377
except Exception as e:

dingo/exec/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from dingo.exec.base import ExecProto, Executor # noqa E402.
12
from dingo.exec.local import LocalExecutor # noqa E402.
23
from dingo.utils import log
34

@@ -6,5 +7,3 @@
67
except Exception as e:
78
log.warning("Spark Executor not imported. Open debug log for more details.")
89
log.debug(str(e))
9-
10-
from dingo.exec.base import ExecProto, Executor # noqa E402.

dingo/exec/local.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def execute(self) -> SummaryModel:
115115
self.summary.type_ratio[field_key] = {}
116116

117117
# 遍历 List[EvalDetail],同时收集指标分数和标签
118+
label_set = set()
118119
for eval_detail in eval_detail_list:
119120
# 收集指标分数(按 field_key 分组)
120121
if eval_detail.score is not None and eval_detail.metric:
@@ -123,8 +124,11 @@ def execute(self) -> SummaryModel:
123124
# 收集标签统计
124125
label_list = eval_detail.label if eval_detail.label else []
125126
for label in label_list:
126-
self.summary.type_ratio[field_key].setdefault(label, 0)
127-
self.summary.type_ratio[field_key][label] += 1
127+
label_set.add(label)
128+
129+
for label in label_set:
130+
self.summary.type_ratio[field_key].setdefault(label, 0)
131+
self.summary.type_ratio[field_key][label] += 1
128132

129133
if result_info.eval_status:
130134
self.summary.num_bad += 1

0 commit comments

Comments
 (0)