Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dingo/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from dingo.config.input_args import (DatasetArgs, DatasetExcelArgs, DatasetFieldArgs, DatasetHFConfigArgs, DatasetS3ConfigArgs, DatasetSqlArgs, EvalPipline, EvalPiplineConfig, # noqa E402.
EvaluatorLLMArgs, EvaluatorRuleArgs, ExecutorArgs, ExecutorResultSaveArgs, InputArgs)
from dingo.config.input_args import (DatasetArgs, DatasetCsvArgs, DatasetExcelArgs, DatasetFieldArgs, DatasetHFConfigArgs, DatasetS3ConfigArgs, DatasetSqlArgs, EvalPipline, # noqa E402.
EvalPiplineConfig, EvaluatorLLMArgs, EvaluatorRuleArgs, ExecutorArgs, ExecutorResultSaveArgs, InputArgs)
9 changes: 9 additions & 0 deletions dingo/config/input_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ class DatasetExcelArgs(BaseModel):
has_header: bool = True # 第一行是否为列名,False 则使用列序号作为列名


class DatasetCsvArgs(BaseModel):
    """Options that control how a CSV dataset file is read."""

    has_header: bool = True  # Whether the first row holds the column names; if False, column_x names are used instead
    encoding: str = 'utf-8'  # File encoding; utf-8 by default, gbk, gb2312, latin1, etc. are also supported
    dialect: str = 'excel'  # CSV dialect: excel (default), excel-tab, unix, etc.
    delimiter: str | None = None  # Field delimiter; None means it is chosen automatically from the dialect
    quotechar: str = '"'  # Quote character; double quote by default


class DatasetFieldArgs(BaseModel):
id: str = ''
prompt: str = ''
Expand All @@ -49,6 +57,7 @@ class DatasetArgs(BaseModel):
s3_config: DatasetS3ConfigArgs = DatasetS3ConfigArgs()
sql_config: DatasetSqlArgs = DatasetSqlArgs()
excel_config: DatasetExcelArgs = DatasetExcelArgs()
csv_config: DatasetCsvArgs = DatasetCsvArgs()


class ExecutorResultSaveArgs(BaseModel):
Expand Down
19 changes: 19 additions & 0 deletions dingo/data/converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,25 @@ def _convert(raw: Union[str, Dict]):
return _convert


@BaseConverter.register("csv")
class CsvConverter(BaseConverter):
    """CSV file converter.

    Registered with ``BaseConverter`` under the ``"csv"`` key. No ``__init__``
    is defined here: the class adds no state of its own, so the constructor
    inherited from ``BaseConverter`` is used unchanged.
    """

Comment on lines +287 to +288
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The __init__ method only calls the parent's __init__, which is done automatically in Python if __init__ is not defined. This method is redundant and can be removed for cleaner code.


@classmethod
def convertor(cls, input_args: InputArgs) -> Callable:
    """Return the callable that turns one CSV record into a ``Data`` object.

    ``input_args`` is not read by this implementation.
    """

    def _convert(raw: Union[str, Dict]):
        # A string record is decoded as JSON; anything else is used as-is.
        fields = json.loads(raw) if isinstance(raw, str) else raw
        return Data(**fields)

    return _convert


@BaseConverter.register("listjson")
class ListJsonConverter(BaseConverter):
"""List json file converter."""
Expand Down
97 changes: 95 additions & 2 deletions dingo/data/datasource/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,94 @@ def _load_excel_file_xlsx(self, path: str) -> Generator[str, None, None]:
if wb:
wb.close()

def _load_csv_file(self, path: str) -> Generator[str, None, None]:
    """
    Load a CSV file and yield its data rows one at a time as JSON strings.

    The file is read lazily through ``csv.reader``, so rows are produced as
    they are parsed. Parsing options (header handling, encoding, dialect,
    delimiter, quote character) come from ``self.input_args.dataset.csv_config``.

    Column names:
        * ``has_header`` is true: the first row supplies the names. Each name
          is stripped; a blank (empty or whitespace-only) name falls back to
          ``column_<index>`` so that it cannot become an empty key.
        * ``has_header`` is false: names are ``column_0 .. column_N`` sized
          from the first row, and the first row is treated as data.

    Rows whose cells are all blank are skipped. A row shorter than the header
    is padded with empty strings; cells beyond the header length are ignored.

    Args:
        path (str): The path to the CSV file.

    Yields:
        str: One JSON object per data row (header -> stripped cell value),
        terminated by a newline.

    Raises:
        RuntimeError: If the file is empty, cannot be decoded with the
            configured encoding, cannot be parsed as CSV, or cannot be read
            for any other reason. The underlying exception is chained.
    """
    import csv
    from itertools import chain

    # Read the CSV configuration once.
    csv_config = self.input_args.dataset.csv_config
    encoding = csv_config.encoding
    dialect = csv_config.dialect

    reader_kwargs = {
        'dialect': dialect,
        'quotechar': csv_config.quotechar,
    }
    # An explicit delimiter overrides the dialect's default.
    if csv_config.delimiter is not None:
        reader_kwargs['delimiter'] = csv_config.delimiter

    try:
        # newline='' lets the csv module handle embedded line breaks itself.
        with open(path, 'r', encoding=encoding, newline='') as csvfile:
            csv_reader = csv.reader(csvfile, **reader_kwargs)

            try:
                first_row = next(csv_reader)
            except StopIteration:
                raise RuntimeError(f'CSV file "{path}" is empty') from None

            if csv_config.has_header:
                # Strip before testing so whitespace-only names also fall back.
                headers = [
                    str(h).strip() or f'column_{i}'
                    for i, h in enumerate(first_row)
                ]
                data_rows = csv_reader
            else:
                # Generate headers and put the first row back in front of the data.
                headers = [f'column_{i}' for i in range(len(first_row))]
                data_rows = chain([first_row], csv_reader)

            for row in data_rows:
                # Skip rows that are empty or contain only blank cells.
                if not any(cell.strip() for cell in row):
                    continue

                # Pair cells with headers, padding short rows with "".
                row_dict = {
                    header: row[i].strip() if i < len(row) else ""
                    for i, header in enumerate(headers)
                }

                yield json.dumps(row_dict, ensure_ascii=False) + '\n'

    except RuntimeError:
        # Already a caller-facing error (e.g. the empty-file case above);
        # let it through instead of re-wrapping it in the generic handler.
        raise
    except UnicodeDecodeError as e:
        # Wrong encoding: point the user at the setting to change.
        raise RuntimeError(
            f'Failed to read CSV file "{path}" with encoding "{encoding}": {str(e)}. '
            f'Please try a different encoding (e.g., "gbk", "gb2312", "latin1", "iso-8859-1").'
        ) from e
    except csv.Error as e:
        # Malformed CSV or an unusable dialect.
        raise RuntimeError(
            f'Failed to parse CSV file "{path}": {str(e)}. '
            f'Current dialect: "{dialect}". You may need to adjust the dialect or delimiter parameter.'
        ) from e
    except Exception as e:
        raise RuntimeError(
            f'Failed to read CSV file "{path}": {str(e)}. '
            f'Please ensure the file is a valid CSV file.'
        ) from e

def _load_excel_file_xls(self, path: str) -> Generator[str, None, None]:
"""
Load an .xls Excel file and return its contents row by row as JSON strings.
Expand Down Expand Up @@ -241,8 +329,13 @@ def _load_local_file(self) -> Generator[str, None, None]:
by_line = self.input_args.dataset.format not in ["json", "listjson"]

for f in f_list:
# Check if file is CSV
if f.endswith('.csv'):
if self.input_args.dataset.format != 'csv':
raise RuntimeError(f'CSV file "{f}" is not supported. Please set dataset.format to "csv" to read CSV files.')
yield from self._load_csv_file(f)
# Check if file is Excel
if f.endswith('.xlsx'):
elif f.endswith('.xlsx'):
if self.input_args.dataset.format != 'excel':
raise RuntimeError(f'Excel file "{f}" is not supported. Please set dataset.format to "excel" to read Excel files.')
yield from self._load_excel_file_xlsx(f)
Expand Down Expand Up @@ -278,7 +371,7 @@ def _load_local_file(self) -> Generator[str, None, None]:
except UnicodeDecodeError as decode_error:
raise RuntimeError(
f'Failed to read file "{f}": Unsupported file format or encoding. '
f'Dingo only supports UTF-8 text files (.jsonl, .json, .txt), Excel files (.xlsx, .xls) and .gz compressed text files. '
f'Dingo only supports UTF-8 text files (.jsonl, .json, .txt), CSV files (.csv), Excel files (.xlsx, .xls) and .gz compressed text files. '
f'Original error: {str(decode_error)}'
)
except Exception as e:
Expand Down
3 changes: 1 addition & 2 deletions dingo/exec/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from dingo.exec.base import ExecProto, Executor # noqa E402.
from dingo.exec.local import LocalExecutor # noqa E402.
from dingo.utils import log

Expand All @@ -6,5 +7,3 @@
except Exception as e:
log.warning("Spark Executor not imported. Open debug log for more details.")
log.debug(str(e))

from dingo.exec.base import ExecProto, Executor # noqa E402.
8 changes: 6 additions & 2 deletions dingo/exec/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ def execute(self) -> SummaryModel:
self.summary.type_ratio[field_key] = {}

# 遍历 List[EvalDetail],同时收集指标分数和标签
label_set = set()
for eval_detail in eval_detail_list:
# 收集指标分数(按 field_key 分组)
if eval_detail.score is not None and eval_detail.metric:
Expand All @@ -123,8 +124,11 @@ def execute(self) -> SummaryModel:
# 收集标签统计
label_list = eval_detail.label if eval_detail.label else []
for label in label_list:
self.summary.type_ratio[field_key].setdefault(label, 0)
self.summary.type_ratio[field_key][label] += 1
label_set.add(label)

for label in label_set:
self.summary.type_ratio[field_key].setdefault(label, 0)
self.summary.type_ratio[field_key][label] += 1

if result_info.eval_status:
self.summary.num_bad += 1
Expand Down
Loading