1414import shutil
1515import re
1616from pathlib import Path
17- from typing import Optional
17+ from typing import Optional , Union
1818
1919
# === Path configuration ===
# Directory that contains this script; every other path is derived from it.
SCRIPT_DIR = Path(__file__).parent
# Repository root sits two levels above the script directory.
REPO_ROOT = SCRIPT_DIR.parent.parent
# Default case configuration shipped next to this script.
CONFIG_PATH = SCRIPT_DIR / "config" / "cases.json"
# Single multilingual config of the sibling edit_tool directory
# (supersedes the former per-language data_zh.json / data_en.json files).
EDIT_TOOL_CONFIG = SCRIPT_DIR.parent / "edit_tool" / "config" / "data.json"
# Directory the built demo page data is written into.
OUTPUT_DIR = REPO_ROOT / "minicpm-o-4_5"
# Directory holding the collected sessions, next to the script directory.
COLLECTED_DIR = SCRIPT_DIR.parent / "collected"
2827
2928
def get_text(obj: Union[str, dict, None], lang: str = "zh") -> str:
    """Return the text for one language from a multilingual value.

    Args:
        obj: A plain string, or a mapping such as
            {"zh": "...", "en": "..."}. ``None`` is treated as empty text.
        lang: Language code to look up in the mapping.

    Returns:
        The text stored under ``lang``; if that entry is missing or ``None``
        the "zh" entry is used, and "" when neither is available. A plain
        string is returned unchanged.
    """
    # A JSON ``null`` would otherwise leak through as None despite ``-> str``.
    if obj is None:
        return ""
    if isinstance(obj, dict):
        text = obj.get(lang)
        if text is None:
            # Fall back to the Chinese entry for a missing or null language.
            text = obj.get("zh")
        return "" if text is None else text
    return obj
42+
43+
def load_config() -> dict:
    """Load the build configuration.

    The edit_tool configuration is preferred when that file exists;
    otherwise the default cases.json is read.

    Returns:
        The parsed JSON content of the selected configuration file.
    """
    # Pick the source first so both cases share one open/parse step.
    if EDIT_TOOL_CONFIG.exists():
        print(f"[INFO] 从 edit_tool 配置加载: { EDIT_TOOL_CONFIG } ")
        chosen_path = EDIT_TOOL_CONFIG
    else:
        print(f"[INFO] 从默认配置加载: { CONFIG_PATH } ")
        chosen_path = CONFIG_PATH

    with open(chosen_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
4357
4458
45- def find_session_dir (source_session : str , lang : str = "zh" ) -> Optional [Path ]:
59+ def find_session_dir (source_session : str , ability_id : str ) -> Optional [Path ]:
4660 """在 collected 目录中查找 session 目录
4761
4862 Args:
4963 source_session: session 目录名,如 session_20260129_034105_a264bd2a
50- lang: 语言,zh 或 en
64+ ability_id: ability ID,用于判断语言(english -> en, 其他 -> zh)
5165
5266 Returns:
5367 session 目录的 Path,找不到返回 None
5468 """
69+ # 根据 ability_id 判断语言
70+ lang = "en" if ability_id == "english" else "zh"
5571 lang_dir = COLLECTED_DIR / lang
72+
5673 if not lang_dir .exists ():
5774 return None
5875
@@ -160,12 +177,12 @@ def copy_audio_files(session_dir: Path, case_id: str, output_audio_dir: Path) ->
160177 return paths
161178
162179
163- def process_case (case : dict , lang : str , output_audio_dir : Path ) -> Optional [dict ]:
180+ def process_case (case : dict , ability_id : str , output_audio_dir : Path ) -> Optional [dict ]:
164181 """处理单个 case,读取 session 数据并复制音频
165182
166183 Args:
167184 case: case 配置
168- lang: 语言
185+ ability_id: 所属 ability 的 ID
169186 output_audio_dir: 输出音频目录
170187
171188 Returns:
@@ -187,7 +204,7 @@ def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict
187204 print (f" [WARN] Case { case .get ('id' , '?' )} 缺少 source_session,跳过" )
188205 return None
189206
190- session_dir = find_session_dir (source_session , lang )
207+ session_dir = find_session_dir (source_session , ability_id )
191208 if not session_dir :
192209 print (f" [WARN] 找不到 session: { source_session } ,跳过" )
193210 return None
@@ -200,7 +217,7 @@ def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict
200217 # 复制音频
201218 audio_paths = copy_audio_files (session_dir , case ["id" ], output_audio_dir )
202219
203- # 构建输出数据(不包含溯源信息 )
220+ # 构建输出数据(保留多语言 summary )
204221 output_case = {
205222 "id" : case ["id" ],
206223 "summary" : case .get ("summary" , "" ),
@@ -228,7 +245,7 @@ def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict
228245 return output_case
229246
230247
231- def build_data_js (config : dict , output_dir : Path ) -> dict :
248+ def build_data (config : dict , output_dir : Path ) -> dict :
232249 """构建 data.js 数据
233250
234251 Args:
@@ -249,24 +266,21 @@ def build_data_js(config: dict, output_dir: Path) -> dict:
249266 for ability in config ["abilities" ]:
250267 output_ability = {
251268 "id" : ability ["id" ],
252- "name" : ability ["name" ],
253- "description" : ability .get ("description" , "" ),
269+ "name" : ability ["name" ], # 保留多语言对象
270+ "description" : ability .get ("description" , "" ), # 保留多语言对象
254271 "sub_abilities" : []
255272 }
256273
257274 for sub_ability in ability .get ("sub_abilities" , []):
258275 output_sub = {
259276 "id" : sub_ability ["id" ],
260- "name" : sub_ability ["name" ],
277+ "name" : sub_ability ["name" ], # 保留多语言对象
261278 "description" : sub_ability .get ("description" , "" ),
262279 "cases" : []
263280 }
264281
265- # 检测语言(简单判断:英文能力用 en,其他用 zh)
266- lang = "en" if ability ["id" ] == "english" else "zh"
267-
268282 for case in sub_ability .get ("cases" , []):
269- processed = process_case (case , lang , output_audio_dir )
283+ processed = process_case (case , ability [ "id" ] , output_audio_dir )
270284 if processed :
271285 output_sub ["cases" ].append (processed )
272286
@@ -290,31 +304,6 @@ def write_data_js(data: dict, output_dir: Path, filename: str = "data.js"):
290304 print (f"写入 { data_js_path } " )
291305
292306
293- def load_config_for_lang (lang : str ) -> Optional [dict ]:
294- """加载指定语言的配置
295-
296- Args:
297- lang: 语言代码,zh 或 en
298-
299- Returns:
300- 配置字典,如果不存在则返回 None
301- """
302- edit_tool_config = EDIT_TOOL_CONFIG_ZH if lang == "zh" else EDIT_TOOL_CONFIG_EN
303-
304- if edit_tool_config .exists ():
305- print (f"[INFO] 从 edit_tool 配置加载 ({ lang } ): { edit_tool_config } " )
306- with open (edit_tool_config , "r" , encoding = "utf-8" ) as f :
307- return json .load (f )
308-
309- # 回退到默认配置
310- if CONFIG_PATH .exists ():
311- print (f"[INFO] 从默认配置加载 ({ lang } ): { CONFIG_PATH } " )
312- with open (CONFIG_PATH , "r" , encoding = "utf-8" ) as f :
313- return json .load (f )
314-
315- return None
316-
317-
318307def main ():
319308 print ("=" * 60 )
320309 print ("MiniCPM-o 4.5 Demo Page Builder" )
@@ -326,46 +315,27 @@ def main():
326315 print (f"清理音频目录: { output_audio_dir } " )
327316 shutil .rmtree (output_audio_dir )
328317
329- total_cases_all = 0
318+ # 加载配置
319+ config = load_config ()
330320
331- # 处理中文版本
321+ # 构建数据
332322 print ("\n " + "=" * 40 )
333- print ("处理中文版本 " )
323+ print ("构建数据 " )
334324 print ("=" * 40 )
335- config_zh = load_config_for_lang ("zh" )
336- if config_zh :
337- output_data_zh = build_data_js (config_zh , OUTPUT_DIR )
338- write_data_js (output_data_zh , OUTPUT_DIR , "data.js" )
339- total_cases_zh = sum (
340- len (sub ["cases" ])
341- for ability in output_data_zh ["abilities" ]
342- for sub in ability ["sub_abilities" ]
343- )
344- print (f"中文版本完成:{ total_cases_zh } 个 cases" )
345- total_cases_all += total_cases_zh
346- else :
347- print ("[WARN] 未找到中文配置文件" )
325+ output_data = build_data (config , OUTPUT_DIR )
348326
349- # 处理英文版本
350- print ("\n " + "=" * 40 )
351- print ("处理英文版本" )
352- print ("=" * 40 )
353- config_en = load_config_for_lang ("en" )
354- if config_en :
355- output_data_en = build_data_js (config_en , OUTPUT_DIR )
356- write_data_js (output_data_en , OUTPUT_DIR , "data_en.js" )
357- total_cases_en = sum (
358- len (sub ["cases" ])
359- for ability in output_data_en ["abilities" ]
360- for sub in ability ["sub_abilities" ]
361- )
362- print (f"英文版本完成:{ total_cases_en } 个 cases" )
363- total_cases_all += total_cases_en
364- else :
365- print ("[WARN] 未找到英文配置文件" )
327+ # 写入单一的 data.js(包含多语言字段)
328+ write_data_js (output_data , OUTPUT_DIR , "data.js" )
329+
330+ # 统计
331+ total_cases = sum (
332+ len (sub ["cases" ])
333+ for ability in output_data ["abilities" ]
334+ for sub in ability ["sub_abilities" ]
335+ )
366336
367337 print ("\n " + "=" * 60 )
368- print (f"构建完成!共处理 { total_cases_all } 个 cases" )
338+ print (f"构建完成!共处理 { total_cases } 个 cases" )
369339 print ("=" * 60 )
370340
371341 return 0
0 commit comments