Skip to content

Commit 5ea5302

Browse files
author
sunweiyue
committed
refactor: merge zh/en data into single multi-lang structure
- Restructure cases.json with multi-lang fields: name, description, summary
- Each field now supports { "zh": "...", "en": "..." } format
- Update build.py to output single data.js with multi-lang data
- Update index.html with client-side language switching
- Single page supports both languages via localStorage
- getText() helper extracts text based on current language
- Remove separate data_en.js and index_zh.html (no longer needed)
- Simplify edit_tool server.py for single data.json
1 parent dc99b23 commit 5ea5302

File tree

8 files changed

+561
-2814
lines changed

8 files changed

+561
-2814
lines changed

develop/edit_tool/config/data_en.json

Lines changed: 0 additions & 799 deletions
This file was deleted.

develop/edit_tool/server.py

Lines changed: 160 additions & 381 deletions
Large diffs are not rendered by default.

develop/minicpm-o-4_5/build.py

Lines changed: 49 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -14,45 +14,62 @@
1414
import shutil
1515
import re
1616
from pathlib import Path
17-
from typing import Optional
17+
from typing import Optional, Union
1818

1919

2020
# === 路径配置 ===
2121
SCRIPT_DIR = Path(__file__).parent
2222
REPO_ROOT = SCRIPT_DIR.parent.parent
2323
CONFIG_PATH = SCRIPT_DIR / "config" / "cases.json"
24-
EDIT_TOOL_CONFIG_ZH = SCRIPT_DIR.parent / "edit_tool" / "config" / "data_zh.json"
25-
EDIT_TOOL_CONFIG_EN = SCRIPT_DIR.parent / "edit_tool" / "config" / "data_en.json"
24+
EDIT_TOOL_CONFIG = SCRIPT_DIR.parent / "edit_tool" / "config" / "data.json"
2625
OUTPUT_DIR = REPO_ROOT / "minicpm-o-4_5"
2726
COLLECTED_DIR = SCRIPT_DIR.parent / "collected"
2827

2928

29+
def get_text(obj: Union[str, dict, None], lang: str = "zh") -> str:
    """Return the text for ``lang`` from a plain string or a multi-language object.

    Args:
        obj: Either a plain string, or a mapping such as
            ``{"zh": "...", "en": "..."}``. ``None`` (e.g. a missing or
            JSON-null field) is accepted and yields an empty string.
        lang: Language code to look up in the mapping.

    Returns:
        The text for ``lang`` when present; otherwise the ``"zh"`` text;
        otherwise ``""``. A plain string is returned unchanged.
    """
    if obj is None:
        # Honour the ``-> str`` contract instead of leaking None to callers.
        return ""
    if isinstance(obj, dict):
        # Try the requested language first, then the default ("zh").
        # A null entry is treated as missing so the fallback still applies;
        # an explicit empty string is kept as-is.
        for key in (lang, "zh"):
            value = obj.get(key)
            if value is not None:
                return value
        return ""
    return obj
42+
43+
3044
def load_config() -> dict:
    """Load the build configuration as a dict.

    The edit_tool config file (``EDIT_TOOL_CONFIG``) takes priority when it
    exists; otherwise the default ``cases.json`` (``CONFIG_PATH``) is read.
    """
    # Decide which file to read, logging the choice, then parse it once.
    if EDIT_TOOL_CONFIG.exists():
        print(f"[INFO] 从 edit_tool 配置加载: {EDIT_TOOL_CONFIG}")
        source = EDIT_TOOL_CONFIG
    else:
        print(f"[INFO] 从默认配置加载: {CONFIG_PATH}")
        source = CONFIG_PATH

    with source.open("r", encoding="utf-8") as fh:
        return json.load(fh)
4357

4458

45-
def find_session_dir(source_session: str, lang: str = "zh") -> Optional[Path]:
59+
def find_session_dir(source_session: str, ability_id: str) -> Optional[Path]:
4660
"""在 collected 目录中查找 session 目录
4761
4862
Args:
4963
source_session: session 目录名,如 session_20260129_034105_a264bd2a
50-
lang: 语言,zh 或 en
64+
ability_id: ability ID,用于判断语言(english -> en, 其他 -> zh)
5165
5266
Returns:
5367
session 目录的 Path,找不到返回 None
5468
"""
69+
# 根据 ability_id 判断语言
70+
lang = "en" if ability_id == "english" else "zh"
5571
lang_dir = COLLECTED_DIR / lang
72+
5673
if not lang_dir.exists():
5774
return None
5875

@@ -160,12 +177,12 @@ def copy_audio_files(session_dir: Path, case_id: str, output_audio_dir: Path) ->
160177
return paths
161178

162179

163-
def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict]:
180+
def process_case(case: dict, ability_id: str, output_audio_dir: Path) -> Optional[dict]:
164181
"""处理单个 case,读取 session 数据并复制音频
165182
166183
Args:
167184
case: case 配置
168-
lang: 语言
185+
ability_id: 所属 ability 的 ID
169186
output_audio_dir: 输出音频目录
170187
171188
Returns:
@@ -187,7 +204,7 @@ def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict
187204
print(f" [WARN] Case {case.get('id', '?')} 缺少 source_session,跳过")
188205
return None
189206

190-
session_dir = find_session_dir(source_session, lang)
207+
session_dir = find_session_dir(source_session, ability_id)
191208
if not session_dir:
192209
print(f" [WARN] 找不到 session: {source_session},跳过")
193210
return None
@@ -200,7 +217,7 @@ def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict
200217
# 复制音频
201218
audio_paths = copy_audio_files(session_dir, case["id"], output_audio_dir)
202219

203-
# 构建输出数据(不包含溯源信息
220+
# 构建输出数据(保留多语言 summary
204221
output_case = {
205222
"id": case["id"],
206223
"summary": case.get("summary", ""),
@@ -228,7 +245,7 @@ def process_case(case: dict, lang: str, output_audio_dir: Path) -> Optional[dict
228245
return output_case
229246

230247

231-
def build_data_js(config: dict, output_dir: Path) -> dict:
248+
def build_data(config: dict, output_dir: Path) -> dict:
232249
"""构建 data.js 数据
233250
234251
Args:
@@ -249,24 +266,21 @@ def build_data_js(config: dict, output_dir: Path) -> dict:
249266
for ability in config["abilities"]:
250267
output_ability = {
251268
"id": ability["id"],
252-
"name": ability["name"],
253-
"description": ability.get("description", ""),
269+
"name": ability["name"], # 保留多语言对象
270+
"description": ability.get("description", ""), # 保留多语言对象
254271
"sub_abilities": []
255272
}
256273

257274
for sub_ability in ability.get("sub_abilities", []):
258275
output_sub = {
259276
"id": sub_ability["id"],
260-
"name": sub_ability["name"],
277+
"name": sub_ability["name"], # 保留多语言对象
261278
"description": sub_ability.get("description", ""),
262279
"cases": []
263280
}
264281

265-
# 检测语言(简单判断:英文能力用 en,其他用 zh)
266-
lang = "en" if ability["id"] == "english" else "zh"
267-
268282
for case in sub_ability.get("cases", []):
269-
processed = process_case(case, lang, output_audio_dir)
283+
processed = process_case(case, ability["id"], output_audio_dir)
270284
if processed:
271285
output_sub["cases"].append(processed)
272286

@@ -290,31 +304,6 @@ def write_data_js(data: dict, output_dir: Path, filename: str = "data.js"):
290304
print(f"写入 {data_js_path}")
291305

292306

293-
def load_config_for_lang(lang: str) -> Optional[dict]:
294-
"""加载指定语言的配置
295-
296-
Args:
297-
lang: 语言代码,zh 或 en
298-
299-
Returns:
300-
配置字典,如果不存在则返回 None
301-
"""
302-
edit_tool_config = EDIT_TOOL_CONFIG_ZH if lang == "zh" else EDIT_TOOL_CONFIG_EN
303-
304-
if edit_tool_config.exists():
305-
print(f"[INFO] 从 edit_tool 配置加载 ({lang}): {edit_tool_config}")
306-
with open(edit_tool_config, "r", encoding="utf-8") as f:
307-
return json.load(f)
308-
309-
# 回退到默认配置
310-
if CONFIG_PATH.exists():
311-
print(f"[INFO] 从默认配置加载 ({lang}): {CONFIG_PATH}")
312-
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
313-
return json.load(f)
314-
315-
return None
316-
317-
318307
def main():
319308
print("=" * 60)
320309
print("MiniCPM-o 4.5 Demo Page Builder")
@@ -326,46 +315,27 @@ def main():
326315
print(f"清理音频目录: {output_audio_dir}")
327316
shutil.rmtree(output_audio_dir)
328317

329-
total_cases_all = 0
318+
# 加载配置
319+
config = load_config()
330320

331-
# 处理中文版本
321+
# 构建数据
332322
print("\n" + "=" * 40)
333-
print("处理中文版本")
323+
print("构建数据")
334324
print("=" * 40)
335-
config_zh = load_config_for_lang("zh")
336-
if config_zh:
337-
output_data_zh = build_data_js(config_zh, OUTPUT_DIR)
338-
write_data_js(output_data_zh, OUTPUT_DIR, "data.js")
339-
total_cases_zh = sum(
340-
len(sub["cases"])
341-
for ability in output_data_zh["abilities"]
342-
for sub in ability["sub_abilities"]
343-
)
344-
print(f"中文版本完成:{total_cases_zh} 个 cases")
345-
total_cases_all += total_cases_zh
346-
else:
347-
print("[WARN] 未找到中文配置文件")
325+
output_data = build_data(config, OUTPUT_DIR)
348326

349-
# 处理英文版本
350-
print("\n" + "=" * 40)
351-
print("处理英文版本")
352-
print("=" * 40)
353-
config_en = load_config_for_lang("en")
354-
if config_en:
355-
output_data_en = build_data_js(config_en, OUTPUT_DIR)
356-
write_data_js(output_data_en, OUTPUT_DIR, "data_en.js")
357-
total_cases_en = sum(
358-
len(sub["cases"])
359-
for ability in output_data_en["abilities"]
360-
for sub in ability["sub_abilities"]
361-
)
362-
print(f"英文版本完成:{total_cases_en} 个 cases")
363-
total_cases_all += total_cases_en
364-
else:
365-
print("[WARN] 未找到英文配置文件")
327+
# 写入单一的 data.js(包含多语言字段)
328+
write_data_js(output_data, OUTPUT_DIR, "data.js")
329+
330+
# 统计
331+
total_cases = sum(
332+
len(sub["cases"])
333+
for ability in output_data["abilities"]
334+
for sub in ability["sub_abilities"]
335+
)
366336

367337
print("\n" + "=" * 60)
368-
print(f"构建完成!共处理 {total_cases_all} 个 cases")
338+
print(f"构建完成!共处理 {total_cases} 个 cases")
369339
print("=" * 60)
370340

371341
return 0

0 commit comments

Comments
 (0)