1- import os
21import json
2+ import os
33from typing import Any , Dict , Generator , List , Optional
44
55from dingo .config import InputArgs
@@ -72,10 +72,10 @@ def _load_excel_file_xlsx(self, path: str) -> Generator[str, None, None]:
7272 try :
7373 # 使用只读模式加载工作簿,节省内存
7474 wb = load_workbook (filename = path , read_only = True , data_only = True )
75-
75+
7676 sheet_name = self .input_args .dataset .excel_config .sheet_name
7777 has_header = self .input_args .dataset .excel_config .has_header
78-
78+
7979 # 选择工作表
8080 if isinstance (sheet_name , str ):
8181 if sheet_name not in wb .sheetnames :
@@ -90,15 +90,15 @@ def _load_excel_file_xlsx(self, path: str) -> Generator[str, None, None]:
9090
9191 # 获取所有行的迭代器
9292 rows = ws .iter_rows (values_only = True )
93-
93+
9494 # 处理标题行
9595 if has_header :
9696 # 读取第一行作为标题
9797 headers = next (rows , None )
9898 if headers is None :
9999 wb .close ()
100100 raise RuntimeError (f'Excel file "{ path } " is empty' )
101-
101+
102102 # 将标题转换为列表,处理 None 值
103103 headers = [str (h ) if h is not None else f'Column_{ i } ' for i , h in enumerate (headers )]
104104 else :
@@ -107,34 +107,34 @@ def _load_excel_file_xlsx(self, path: str) -> Generator[str, None, None]:
107107 if first_row is None :
108108 wb .close ()
109109 raise RuntimeError (f'Excel file "{ path } " is empty' )
110-
110+
111111 # 使用列序号作为列名
112112 headers = [str (i ) for i in range (len (first_row ))]
113-
113+
114114 # 处理第一行数据
115115 if not all (cell is None for cell in first_row ):
116116 row_dict = {}
117117 for i , (header , value ) in enumerate (zip (headers , first_row )):
118118 row_dict [header ] = value if value is not None else ""
119119 yield json .dumps (row_dict , ensure_ascii = False ) + '\n '
120-
120+
121121 # 逐行读取数据并转换为 JSON
122122 for row in rows :
123123 # 跳过空行
124124 if all (cell is None for cell in row ):
125125 continue
126-
126+
127127 # 将行数据与标题组合成字典
128128 row_dict = {}
129129 for i , (header , value ) in enumerate (zip (headers , row )):
130130 # 处理值为 None 的情况
131131 row_dict [header ] = value if value is not None else ""
132-
132+
133133 # 转换为 JSON 字符串并 yield
134134 yield json .dumps (row_dict , ensure_ascii = False ) + '\n '
135-
135+
136136 wb .close ()
137-
137+
138138 except Exception as e :
139139 raise RuntimeError (
140140 f'Failed to read .xlsx file "{ path } ": { str (e )} . '
@@ -161,10 +161,10 @@ def _load_excel_file_xls(self, path: str) -> Generator[str, None, None]:
161161 try :
162162 # 打开工作簿
163163 wb = xlrd .open_workbook (path , on_demand = True )
164-
164+
165165 sheet_name = self .input_args .dataset .excel_config .sheet_name
166166 has_header = self .input_args .dataset .excel_config .has_header
167-
167+
168168 # 选择工作表
169169 if isinstance (sheet_name , str ):
170170 try :
@@ -180,38 +180,38 @@ def _load_excel_file_xls(self, path: str) -> Generator[str, None, None]:
180180
181181 if ws .nrows == 0 :
182182 raise RuntimeError (f'Excel file "{ path } " is empty' )
183-
183+
184184 # 处理标题行
185185 start_row = 0
186186 if has_header :
187187 # 读取第一行作为标题
188- headers = [str (cell .value ) if cell .value is not None else f'Column_{ i } '
188+ headers = [str (cell .value ) if cell .value is not None else f'Column_{ i } '
189189 for i , cell in enumerate (ws .row (0 ))]
190190 start_row = 1
191191 else :
192192 # 使用列序号作为列名
193193 headers = [str (i ) for i in range (ws .ncols )]
194194 start_row = 0
195-
195+
196196 # 逐行读取数据并转换为 JSON
197197 for row_idx in range (start_row , ws .nrows ):
198198 row = ws .row (row_idx )
199-
199+
200200 # 跳过空行
201201 if all (cell .value is None or cell .value == '' for cell in row ):
202202 continue
203-
203+
204204 # 将行数据与标题组合成字典
205205 row_dict = {}
206206 for i , (header , cell ) in enumerate (zip (headers , row )):
207207 # 处理值为 None 或空的情况
208208 row_dict [header ] = cell .value if cell .value is not None else ""
209-
209+
210210 # 转换为 JSON 字符串并 yield
211211 yield json .dumps (row_dict , ensure_ascii = False ) + '\n '
212-
212+
213213 wb .release_resources ()
214-
214+
215215 except Exception as e :
216216 raise RuntimeError (
217217 f'Failed to read .xls file "{ path } ": { str (e )} . '
@@ -229,7 +229,7 @@ def _load_local_file(self) -> Generator[str, None, None]:
229229
230230 if not os .path .exists (self .path ):
231231 raise RuntimeError (f'"{ self .path } " is not a valid path' )
232-
232+
233233 f_list = []
234234 if os .path .exists (self .path ) and os .path .isfile (self .path ):
235235 f_list = [self .path ]
@@ -284,4 +284,3 @@ def _load_local_file(self) -> Generator[str, None, None]:
284284 f'Unexpected error reading file "{ f } ": { str (e )} . '
285285 f'Please check if the file exists and is readable.'
286286 )
287-
0 commit comments