77# - meta.level, meta.table, meta.code, meta.equation (通过 statics.py)
88# - meta.language (通过 language_classify.py)
99# - meta.style (通过 style_classify.py)
10+ # - 简化 meta 字段 (通过 simplify_meta.py)
1011#
1112# 使用方法:
1213# ./scripts/process_dataset.sh <input_file> <output_file> <api_key> [base_url]
@@ -83,6 +84,7 @@ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
8384TEMP_DIR=" data/temp_${TIMESTAMP} "
8485STEP1_OUTPUT=" ${TEMP_DIR} /step1_with_stats.jsonl"
8586STEP2_OUTPUT=" ${TEMP_DIR} /step2_with_language.jsonl"
87+ STEP3_OUTPUT=" ${TEMP_DIR} /step3_with_style.jsonl"
8688
8789# 验证输入文件
8890if [ ! -f " $INPUT_FILE " ]; then
@@ -110,7 +112,7 @@ echo ""
110112# ============================================================================
111113# 步骤 1: 添加统计字段
112114# ============================================================================
113- print_step " 📊 步骤 1/3 : 计算统计字段 (level, table, code, equation)"
115+ print_step " 📊 步骤 1/4 : 计算统计字段 (level, table, code, equation)"
114116
115117if python scripts/statics.py --input " $INPUT_FILE " --output " $STEP1_OUTPUT " ; then
116118 STEP1_LINES=$( wc -l < " $STEP1_OUTPUT " | tr -d ' ' )
129131# ============================================================================
130132# 步骤 2: 添加语言字段
131133# ============================================================================
132- print_step " 🌐 步骤 2/3 : 检测语言 (language)"
134+ print_step " 🌐 步骤 2/4 : 检测语言 (language)"
133135
134136export OPENAI_API_KEY=" $API_KEY "
135137
156158# ============================================================================
157159# 步骤 3: 添加网页类型字段
158160# ============================================================================
159- print_step " 🎨 步骤 3/3 : 分类网页类型 (style)"
161+ print_step " 🎨 步骤 3/4 : 分类网页类型 (style)"
160162
161163if python scripts/style_classify.py \
162164 " $STEP2_OUTPUT " \
163- --output " $FINAL_OUTPUT " \
165+ --output " $STEP3_OUTPUT " \
164166 --api-key " $API_KEY " \
165167 --base-url " $BASE_URL " \
166168 --batch-size " $BATCH_SIZE " ; then
167169
168- FINAL_LINES =$( wc -l < " $FINAL_OUTPUT " | tr -d ' ' )
169- print_success " 步骤 3 完成!处理了 $FINAL_LINES 条数据"
170+ STEP3_LINES =$( wc -l < " $STEP3_OUTPUT " | tr -d ' ' )
171+ print_success " 步骤 3 完成!处理了 $STEP3_LINES 条数据"
170172
171173 # 验证数据完整性
172- if [ " $STEP2_LINES " -ne " $FINAL_LINES " ]; then
173- print_warning " 数据行数不一致!输入: $STEP2_LINES , 输出: $FINAL_LINES "
174+ if [ " $STEP2_LINES " -ne " $STEP3_LINES " ]; then
175+ print_warning " 数据行数不一致!输入: $STEP2_LINES , 输出: $STEP3_LINES "
174176 fi
175177else
176178 print_error " 步骤 3 失败!"
177179 print_warning " 保留中间文件: $STEP2_OUTPUT "
178180 exit 1
179181fi
180182
183+ # ============================================================================
184+ # 步骤 4: 简化 meta 字段
185+ # ============================================================================
186+ print_step " 🔧 步骤 4/4: 简化 meta 字段 (只保留核心字段)"
187+
188+ if python scripts/simplify_meta.py \
189+ " $STEP3_OUTPUT " \
190+ --output " $FINAL_OUTPUT " ; then
191+
192+ FINAL_LINES=$( wc -l < " $FINAL_OUTPUT " | tr -d ' ' )
193+ print_success " 步骤 4 完成!处理了 $FINAL_LINES 条数据"
194+
195+ # 验证数据完整性
196+ if [ " $STEP3_LINES " -ne " $FINAL_LINES " ]; then
197+ print_warning " 数据行数不一致!输入: $STEP3_LINES , 输出: $FINAL_LINES "
198+ fi
199+ else
200+ print_error " 步骤 4 失败!"
201+ print_warning " 保留中间文件: $STEP3_OUTPUT "
202+ exit 1
203+ fi
204+
181205# ============================================================================
182206# 完成与清理
183207# ============================================================================
199223
200224# 显示输出文件示例
201225print_step " 📋 输出数据示例"
202- print_info " 查看第一条数据的 meta 字段:"
226+ print_info " 查看第一条数据的 meta 字段(已简化) :"
203227echo " "
204228head -n 1 " $FINAL_OUTPUT " | python -c "
205229import json
@@ -208,8 +232,10 @@ import sys
208232data = json.loads(sys.stdin.read())
209233meta = data.get('meta', {})
210234
211- print('Meta 字段内容:')
235+ print('Meta 字段内容(已简化) :')
212236print(json.dumps(meta, indent=2, ensure_ascii=False))
237+ print()
238+ print('包含字段:', ', '.join(meta.keys()))
213239"
214240
215241print_success " 全部完成!🎊"
0 commit comments