|
| 1 | +#!/bin/bash |
| 2 | + |
| 3 | +############################################################################### |
| 4 | +# WebMainBench 数据集完整处理脚本 |
| 5 | +# |
| 6 | +# 功能:为数据集添加完整的 meta 字段 |
| 7 | +# - meta.level, meta.table, meta.code, meta.equation (通过 statics.py) |
| 8 | +# - meta.language (通过 language_classify.py) |
| 9 | +# - meta.style (通过 style_classify.py) |
| 10 | +# |
| 11 | +# 使用方法: |
| 12 | +# ./scripts/process_dataset.sh <input_file> <output_file> <api_key> [base_url] [batch_size] |
| 13 | +# |
| 14 | +# 示例: |
| 15 | +# ./scripts/process_dataset.sh \ |
| 16 | +# data/sample_dataset_with_fields.jsonl \ |
| 17 | +# data/final_dataset.jsonl \ |
| 18 | +# sk-xxxxx \ |
| 19 | +# https://api.deepseek.com/v1 |
| 20 | +############################################################################### |
| 21 | + |
# Strict mode: abort on any failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ANSI colour escape sequences used by the print_* helpers.
# They are stored with a literal backslash ("\033") and only turn into real
# escape bytes when printed through `echo -e` or `printf '%b'`.
# Marked readonly so later code cannot overwrite them by accident.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color (reset attributes)
| 30 | + |
# Coloured message helpers.

#######################################
# Print an informational message in blue on stdout.
# Globals:   BLUE, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   one line on stdout
#######################################
print_info() {
  # Colour codes go through %b so their "\033" escapes are expanded; the
  # message goes through %s so backslashes inside it (for example in a file
  # path) are printed literally instead of being treated as escapes.
  printf '%b%s%b\n' "${BLUE}" "ℹ️ $1" "${NC}"
}
| 35 | + |
#######################################
# Print a success message in green on stdout.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   one line on stdout
#######################################
print_success() {
  local decorated="${GREEN}✅ $1${NC}"
  # %b expands backslash escapes in its argument the same way `echo -e` does.
  printf '%b\n' "${decorated}"
}
| 39 | + |
#######################################
# Print a warning message in yellow on stderr.
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   one line on stderr (diagnostics must not pollute stdout)
#######################################
print_warning() {
  # Only the colour codes are escape-expanded (%b); the message is emitted
  # verbatim (%s) so paths containing backslashes are not mangled.
  printf '%b%s%b\n' "${YELLOW}" "⚠️ $1" "${NC}" >&2
}
| 43 | + |
#######################################
# Print an error message in red on stderr.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: $1 - message text
# Outputs:   one line on stderr (diagnostics must not pollute stdout)
#######################################
print_error() {
  # Only the colour codes are escape-expanded (%b); the message is emitted
  # verbatim (%s) so paths containing backslashes are not mangled.
  printf '%b%s%b\n' "${RED}" "❌ $1" "${NC}" >&2
}
| 47 | + |
#######################################
# Print a section banner: a blank line, a green rule, the title in green,
# another green rule, and a trailing blank line.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: $1 - banner title
# Outputs:   five lines on stdout
#######################################
print_step() {
  local rule="${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  # One printf per original echo; %b expands escapes the way `echo -e` does.
  printf '\n%b\n' "${rule}"
  printf '%b\n' "${GREEN}$1${NC}"
  printf '%b\n\n' "${rule}"
}
| 53 | + |
# Argument check: the first three positional arguments are mandatory.
# With fewer than three, report the problem, print the usage text and exit 1.
if (( $# < 3 )); then
  print_error "参数不足!"
  # One argument per output line; printf reuses the '%s\n' format for each.
  printf '%s\n' \
    "" \
    "使用方法:" \
    " $0 <input_file> <output_file> <api_key> [base_url] [batch_size]" \
    "" \
    "参数说明:" \
    " input_file - 输入 JSONL 文件路径" \
    " output_file - 最终输出文件路径" \
    " api_key - OpenAI API 密钥" \
    " base_url - API 基础 URL (可选,默认: https://api.deepseek.com/v1)" \
    " batch_size - 批处理大小 (可选,默认: 50)" \
    "" \
    "示例:" \
    " $0 data/input.jsonl data/output.jsonl sk-xxxxx" \
    " $0 data/input.jsonl data/output.jsonl sk-xxxxx https://api.openai.com/v1 100"
  exit 1
fi
| 73 | + |
# Positional parameters; base URL and batch size fall back to defaults when
# omitted or empty.
INPUT_FILE="$1"
FINAL_OUTPUT="$2"
API_KEY="$3"
BASE_URL="${4:-https://api.deepseek.com/v1}"
BATCH_SIZE="${5:-50}"

# Validate everything before creating anything on disk.
if [[ ! -f "$INPUT_FILE" ]]; then
  print_error "输入文件不存在: $INPUT_FILE"
  exit 1
fi

# The batch size is forwarded to the classifier scripts as --batch-size;
# reject anything that is not a positive integer up front.
if [[ ! "$BATCH_SIZE" =~ ^[1-9][0-9]*$ ]]; then
  print_error "批处理大小必须是正整数: $BATCH_SIZE"
  exit 1
fi

# Working directory for intermediate files. The timestamp keeps the name
# readable; the mktemp suffix guarantees a fresh directory even when two runs
# start within the same second (a plain `mkdir -p` would silently share it).
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
mkdir -p data
TEMP_DIR=$(mktemp -d "data/temp_${TIMESTAMP}_XXXXXX")
STEP1_OUTPUT="${TEMP_DIR}/step1_with_stats.jsonl"
STEP2_OUTPUT="${TEMP_DIR}/step2_with_language.jsonl"
| 95 | + |
# Show the effective configuration, one print_info line per setting.
print_step "🚀 开始处理数据集"
for config_line in \
  "输入文件: $INPUT_FILE" \
  "输出文件: $FINAL_OUTPUT" \
  "API 地址: $BASE_URL" \
  "批处理大小: $BATCH_SIZE" \
  "临时目录: $TEMP_DIR"; do
  print_info "$config_line"
done
printf '\n'

# Count the input records (newline count); tr removes the space padding that
# some wc implementations emit.
INPUT_LINES=$(wc -l < "$INPUT_FILE" | tr -d ' ')
print_info "输入数据总数: $INPUT_LINES 条"
printf '\n'
| 109 | + |
# ============================================================================
# Step 1: add the statistics fields
# ============================================================================
print_step "📊 步骤 1/3: 计算统计字段 (level, table, code, equation)"

# Guard clause: if statics.py fails there is nothing worth keeping, so the
# temp directory is removed before exiting.
if ! python scripts/statics.py --input "$INPUT_FILE" --output "$STEP1_OUTPUT"; then
  print_error "步骤 1 失败!"
  rm -rf "$TEMP_DIR"
  exit 1
fi

STEP1_LINES=$(wc -l < "$STEP1_OUTPUT" | tr -d ' ')
print_success "步骤 1 完成!处理了 $STEP1_LINES 条数据"

# Integrity check: a differing line count only produces a warning.
if [ "$INPUT_LINES" -ne "$STEP1_LINES" ]; then
  print_warning "数据行数不一致!输入: $INPUT_LINES, 输出: $STEP1_LINES"
fi
| 128 | + |
# ============================================================================
# Step 2: add the language field
# ============================================================================
print_step "🌐 步骤 2/3: 检测语言 (language)"

# The key is exported to the environment and also passed explicitly below.
export OPENAI_API_KEY="$API_KEY"

# Build the command as an array so every argument stays a single word.
language_cmd=(
  python scripts/language_classify.py
  "$STEP1_OUTPUT"
  --output "$STEP2_OUTPUT"
  --api-key "$API_KEY"
  --base-url "$BASE_URL"
  --batch-size "$BATCH_SIZE"
)

# Guard clause: on failure the step 1 output is kept for inspection.
if ! "${language_cmd[@]}"; then
  print_error "步骤 2 失败!"
  print_warning "保留中间文件: $STEP1_OUTPUT"
  exit 1
fi

STEP2_LINES=$(wc -l < "$STEP2_OUTPUT" | tr -d ' ')
print_success "步骤 2 完成!处理了 $STEP2_LINES 条数据"

# Integrity check: a differing line count only produces a warning.
if [ "$STEP1_LINES" -ne "$STEP2_LINES" ]; then
  print_warning "数据行数不一致!输入: $STEP1_LINES, 输出: $STEP2_LINES"
fi
| 155 | + |
# ============================================================================
# Step 3: add the page-style field
# ============================================================================
print_step "🎨 步骤 3/3: 分类网页类型 (style)"

# Build the command as an array so every argument stays a single word.
style_cmd=(
  python scripts/style_classify.py
  "$STEP2_OUTPUT"
  --output "$FINAL_OUTPUT"
  --api-key "$API_KEY"
  --base-url "$BASE_URL"
  --batch-size "$BATCH_SIZE"
)

# Guard clause: on failure the step 2 output is kept for inspection.
if ! "${style_cmd[@]}"; then
  print_error "步骤 3 失败!"
  print_warning "保留中间文件: $STEP2_OUTPUT"
  exit 1
fi

FINAL_LINES=$(wc -l < "$FINAL_OUTPUT" | tr -d ' ')
print_success "步骤 3 完成!处理了 $FINAL_LINES 条数据"

# Integrity check: a differing line count only produces a warning.
if [ "$STEP2_LINES" -ne "$FINAL_LINES" ]; then
  print_warning "数据行数不一致!输入: $STEP2_LINES, 输出: $FINAL_LINES"
fi
| 180 | + |
# ============================================================================
# Completion and cleanup
# ============================================================================
print_step "🎉 处理完成!"

print_info "最终输出: $FINAL_OUTPUT"
print_info "处理数据: $FINAL_LINES 条"
echo ""

# Ask whether to delete the intermediate files.
# `read` returns non-zero at end-of-input (for example when stdin is closed or
# redirected from /dev/null). Under `set -e` an unguarded failure would abort
# the script here even though all processing succeeded, so the failure is
# handled explicitly and treated as "no answer", which keeps the files.
REPLY=""
read -r -n 1 -p "是否删除临时文件?(y/n) " || REPLY=""
echo ""
if [[ "$REPLY" =~ ^[Yy]$ ]]; then
  # ${TEMP_DIR:?} aborts instead of running `rm -rf` on an empty path.
  rm -rf -- "${TEMP_DIR:?}"
  print_success "已删除临时文件"
else
  print_info "临时文件保留在: $TEMP_DIR"
fi
| 199 | + |
# Preview: pretty-print the meta field of the first output record.
print_step "📋 输出数据示例"
print_info "查看第一条数据的 meta 字段:"
echo ""

# The preview is cosmetic. If the output file is empty or its first line is
# not a JSON object, the Python snippet exits non-zero; that is reported as a
# warning rather than failing a run whose processing steps all succeeded.
if ! head -n 1 "$FINAL_OUTPUT" | python -c '
import json
import sys

data = json.loads(sys.stdin.read())
meta = data.get("meta", {})

print("Meta 字段内容:")
print(json.dumps(meta, indent=2, ensure_ascii=False))
'; then
  print_warning "无法显示输出示例(输出文件为空或首行不是合法的 JSON 对象)"
fi

print_success "全部完成!🎊"
| 216 | + |
0 commit comments