Merge pull request #54 from e06084/main

e06084 · web-flow · commit c20043071e29 · 2025-11-10T11:49:06.000+08:00
feat: add dataset process script
diff --git a/README.md b/README.md
@@ -93,6 +93,8 @@ print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
 
 - **trafilatura**: trafilatura抽取器
 - **resiliparse**: resiliparse抽取器
+- **llm-webkit**: llm-webkit 抽取器
+- **magic-html**: magic-html 抽取器
 - **自定义抽取器**: 通过继承 `BaseExtractor` 实现
 
 ## 评测榜单
diff --git a/scripts/README.md b/scripts/README.md
@@ -0,0 +1,154 @@
+# Scripts 使用说明
+
+本目录包含数据处理和分析的工具脚本。
+
+## 📋 主要脚本
+
+### 数据处理脚本
+
+| 脚本 | 功能 | 添加字段 |
+|------|------|----------|
+| `statics.py` | 统计分析 | `meta.level`, `meta.table`, `meta.code`, `meta.equation` |
+| `language_classify.py` | 语言检测 | `meta.language` |
+| `style_classify.py` | 类型分类 | `meta.style` |
+| `process_dataset.sh` | 一键处理 | 上述所有字段 |
+
+### 数据管理脚本
+
+| 脚本 | 功能 |
+|------|------|
+| `merge_jsonl.py` | 合并多个 JSONL 文件 |
+| `filter_by_scores.py` | 按评分筛选数据 |
+| `diff_jsonl.py` | 对比 JSONL 文件差异 |
+| `add_raw_html_field.py` | 添加原始 HTML 字段 |
+| `merge_meta_data.py` | 合并 meta 数据 |
+
+### 分析脚本
+
+| 脚本 | 功能 |
+|------|------|
+| `analyze_style_results.py` | 分析网页类型分布 |
+| `quick_style_stats.py` | 快速统计网页类型 |
+
+## 🚀 快速开始
+
+### 方式一：一键处理（推荐）
+
+```bash
+# 赋予执行权限（仅首次需要）
+chmod +x scripts/process_dataset.sh
+
+# 执行处理（默认使用 gpt-5 模型）
+./scripts/process_dataset.sh \
+  data/sample_dataset_with_fields.jsonl \
+  data/final_dataset.jsonl \
+  YOUR_API_KEY \
+  YOUR_BASE_URL
+```
+
+### 方式二：分步处理
+
+```bash
+# 步骤 1: 统计分析
+python scripts/statics.py \
+  --input data/input.jsonl \
+  --output data/step1.jsonl
+
+# 步骤 2: 语言检测
+python scripts/language_classify.py \
+  data/step1.jsonl \
+  --output data/step2.jsonl \
+  --api-key YOUR_API_KEY \
+  --base-url https://api.deepseek.com/v1
+
+# 步骤 3: 类型分类
+python scripts/style_classify.py \
+  data/step2.jsonl \
+  --output data/final.jsonl \
+  --api-key YOUR_API_KEY \
+  --base-url https://api.deepseek.com/v1
+```
+
+## 🔑 环境变量
+
+```bash
+# 设置 API 密钥（推荐）
+export OPENAI_API_KEY="your_api_key"
+
+# 或在命令行中通过 --api-key 参数传递
+```
+
+## ⚙️ 常用参数
+
+### statics.py
+
+```bash
+python scripts/statics.py --input <input_file> --output <output_file>
+```
+
+**无需 API 密钥**
+
+### language_classify.py
+
+```bash
+python scripts/language_classify.py <input_file> \
+  --output <output_file> \
+  --api-key <api_key> \
+  --base-url <base_url> \
+  --batch-size <size>
+```
+
+**参数：**
+- `--api-key`: API 密钥（必需）
+- `--base-url`: API 地址（默认：`https://api.deepseek.com/v1`）
+- `--batch-size`: 批处理大小（默认：100）
+
+### style_classify.py
+
+```bash
+python scripts/style_classify.py <input_file> \
+  --output <output_file> \
+  --api-key <api_key> \
+  --base-url <base_url> \
+  --batch-size <size>
+```
+
+**参数：**
+- `--api-key`: API 密钥（必需）
+- `--base-url`: API 地址（必需）
+- `--batch-size`: 批处理大小（默认：100）
+
+
+## 🐛 故障排查
+
+### 问题 1: API 调用失败
+
+```bash
+# 检查 API 密钥
+echo $OPENAI_API_KEY
+
+# 测试 API 连接
+curl https://api.deepseek.com/v1/models \
+  -H "Authorization: Bearer YOUR_API_KEY"
+```
+
+### 问题 2: 数据行数不匹配
+
+```bash
+# 检查文件行数
+wc -l data/input.jsonl
+wc -l data/output.jsonl
+
+# 检查 JSON 格式
+head -n 1 data/output.jsonl | python -m json.tool
+```
+
+### 问题 3: 脚本权限不足
+
+```bash
+# 赋予执行权限
+chmod +x scripts/process_dataset.sh
+chmod +x scripts/*.py
+```
+
+
diff --git a/scripts/process_dataset.sh b/scripts/process_dataset.sh
@@ -0,0 +1,216 @@
+#!/bin/bash
+
+###############################################################################
+# WebMainBench 数据集完整处理脚本
+# 
+# 功能：为数据集添加完整的 meta 字段
+# - meta.level, meta.table, meta.code, meta.equation (通过 statics.py)
+# - meta.language (通过 language_classify.py)
+# - meta.style (通过 style_classify.py)
+#
+# 使用方法：
+#   ./scripts/process_dataset.sh <input_file> <output_file> <api_key> [base_url] [batch_size]
+#
+# 示例：
+#   ./scripts/process_dataset.sh \
+#     data/sample_dataset_with_fields.jsonl \
+#     data/final_dataset.jsonl \
+#     sk-xxxxx \
+#     https://api.deepseek.com/v1
+###############################################################################
+
+set -e  # 遇到错误立即退出
+
+# 颜色定义
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# 打印带颜色的消息
+print_info() {
+    echo -e "${BLUE}ℹ️  $1${NC}"
+}
+
+print_success() {
+    echo -e "${GREEN}✅ $1${NC}"
+}
+
+print_warning() {
+    echo -e "${YELLOW}⚠️  $1${NC}"
+}
+
+print_error() {
+    echo -e "${RED}❌ $1${NC}"
+}
+
+print_step() {
+    echo -e "\n${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+    echo -e "${GREEN}$1${NC}"
+    echo -e "${GREEN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}\n"
+}
+
+# 检查参数
+if [ $# -lt 3 ]; then
+    print_error "参数不足！"
+    echo ""
+    echo "使用方法："
+    echo "  $0 <input_file> <output_file> <api_key> [base_url] [batch_size]"
+    echo ""
+    echo "参数说明："
+    echo "  input_file   - 输入 JSONL 文件路径"
+    echo "  output_file  - 最终输出文件路径"
+    echo "  api_key      - OpenAI API 密钥"
+    echo "  base_url     - API 基础 URL (可选，默认: https://api.deepseek.com/v1)"
+    echo "  batch_size   - 批处理大小 (可选，默认: 50)"
+    echo ""
+    echo "示例："
+    echo "  $0 data/input.jsonl data/output.jsonl sk-xxxxx"
+    echo "  $0 data/input.jsonl data/output.jsonl sk-xxxxx https://api.openai.com/v1 100"
+    exit 1
+fi
+
+# Read positional arguments; base_url and batch_size fall back to defaults.
+INPUT_FILE="$1"
+FINAL_OUTPUT="$2"
+API_KEY="$3"
+BASE_URL="${4:-https://api.deepseek.com/v1}"
+BATCH_SIZE="${5:-50}"
+
+# Intermediate files go into a timestamped directory under data/.
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+TEMP_DIR="data/temp_${TIMESTAMP}"
+STEP1_OUTPUT="${TEMP_DIR}/step1_with_stats.jsonl"
+STEP2_OUTPUT="${TEMP_DIR}/step2_with_language.jsonl"
+
+# Validate the input file before creating anything on disk.
+if [ ! -f "$INPUT_FILE" ]; then
+    print_error "输入文件不存在: $INPUT_FILE"
+    exit 1
+fi
+
+# Create the temporary directory.
+mkdir -p "$TEMP_DIR"
+
+# Print the effective configuration.
+print_step "🚀 开始处理数据集"
+print_info "输入文件: $INPUT_FILE"
+print_info "输出文件: $FINAL_OUTPUT"
+print_info "API 地址: $BASE_URL"
+print_info "批处理大小: $BATCH_SIZE"
+print_info "临时目录: $TEMP_DIR"
+echo ""
+
+# Count the input records. `grep -c ''` counts every line, including a final
+# line with no trailing newline, which `wc -l` would miss and thereby trigger
+# a spurious line-count mismatch warning after step 1. grep exits with 1 when
+# the count is 0 (empty file), so that status is deliberately ignored.
+INPUT_LINES=$(grep -c '' "$INPUT_FILE" || true)
+print_info "输入数据总数: $INPUT_LINES 条"
+echo ""
+
+# ============================================================================
+# Step 1: add the statistics fields
+# ============================================================================
+print_step "📊 步骤 1/3: 计算统计字段 (level, table, code, equation)"
+
+# Guard clause: if statics.py fails, remove the temp directory and abort.
+if ! python scripts/statics.py --input "$INPUT_FILE" --output "$STEP1_OUTPUT"; then
+    print_error "步骤 1 失败！"
+    rm -rf "$TEMP_DIR"
+    exit 1
+fi
+
+STEP1_LINES=$(wc -l < "$STEP1_OUTPUT" | tr -d ' ')
+print_success "步骤 1 完成！处理了 $STEP1_LINES 条数据"
+
+# Integrity check: warn (without aborting) when the record count changed.
+if [ "$INPUT_LINES" -ne "$STEP1_LINES" ]; then
+    print_warning "数据行数不一致！输入: $INPUT_LINES, 输出: $STEP1_LINES"
+fi
+
+# ============================================================================
+# 步骤 2: 添加语言字段
+# ============================================================================
+print_step "🌐 步骤 2/3: 检测语言 (language)"
+
+export OPENAI_API_KEY="$API_KEY"
+
+if python scripts/language_classify.py \
+    "$STEP1_OUTPUT" \
+    --output "$STEP2_OUTPUT" \
+    --api-key "$API_KEY" \
+    --base-url "$BASE_URL" \
+    --batch-size "$BATCH_SIZE"; then
+    
+    STEP2_LINES=$(wc -l < "$STEP2_OUTPUT" | tr -d ' ')
+    print_success "步骤 2 完成！处理了 $STEP2_LINES 条数据"
+    
+    # 验证数据完整性
+    if [ "$STEP1_LINES" -ne "$STEP2_LINES" ]; then
+        print_warning "数据行数不一致！输入: $STEP1_LINES, 输出: $STEP2_LINES"
+    fi
+else
+    print_error "步骤 2 失败！"
+    print_warning "保留中间文件: $STEP1_OUTPUT"
+    exit 1
+fi
+
+# ============================================================================
+# 步骤 3: 添加网页类型字段
+# ============================================================================
+print_step "🎨 步骤 3/3: 分类网页类型 (style)"
+
+if python scripts/style_classify.py \
+    "$STEP2_OUTPUT" \
+    --output "$FINAL_OUTPUT" \
+    --api-key "$API_KEY" \
+    --base-url "$BASE_URL" \
+    --batch-size "$BATCH_SIZE"; then
+    
+    FINAL_LINES=$(wc -l < "$FINAL_OUTPUT" | tr -d ' ')
+    print_success "步骤 3 完成！处理了 $FINAL_LINES 条数据"
+    
+    # 验证数据完整性
+    if [ "$STEP2_LINES" -ne "$FINAL_LINES" ]; then
+        print_warning "数据行数不一致！输入: $STEP2_LINES, 输出: $FINAL_LINES"
+    fi
+else
+    print_error "步骤 3 失败！"
+    print_warning "保留中间文件: $STEP2_OUTPUT"
+    exit 1
+fi
+
+# ============================================================================
+# 完成与清理
+# ============================================================================
+print_step "🎉 处理完成！"
+
+print_info "最终输出: $FINAL_OUTPUT"
+print_info "处理数据: $FINAL_LINES 条"
+echo ""
+
+# 询问是否删除临时文件
+read -p "是否删除临时文件？(y/n) " -n 1 -r
+echo ""
+if [[ $REPLY =~ ^[Yy]$ ]]; then
+    rm -rf "$TEMP_DIR"
+    print_success "已删除临时文件"
+else
+    print_info "临时文件保留在: $TEMP_DIR"
+fi
+
+# 显示输出文件示例
+print_step "📋 输出数据示例"
+print_info "查看第一条数据的 meta 字段："
+echo ""
+head -n 1 "$FINAL_OUTPUT" | python -c "
+import json
+import sys
+
+data = json.loads(sys.stdin.read())
+meta = data.get('meta', {})
+
+print('Meta 字段内容:')
+print(json.dumps(meta, indent=2, ensure_ascii=False))
+"
+
+print_success "全部完成！🎊"
+
diff --git a/scripts/statics.py b/scripts/statics.py