e06084
diff --git a/‎README.md‎
Lines changed: 112 additions & 16 deletions b/‎README.md‎
Lines changed: 112 additions & 16 deletions
diff --git a/‎examples/basic_usage.py‎
Lines changed: 1 addition & 1 deletion b/‎examples/basic_usage.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/diff_jsonl.py‎
Lines changed: 211 additions & 0 deletions b/‎scripts/diff_jsonl.py‎
Lines changed: 211 additions & 0 deletions
@@ -74,23 +74,30 @@ print(f"Overall Score: {result.overall_metrics['overall']:.4f}")
 {
   "track_id": "0b7f2636-d35f-40bf-9b7f-94be4bcbb396",
   "html": "<html><body><h1 cc-select=\"true\">这是标题</h1></body></html>",   # 人工标注带cc-select="true" 属性
-  "groundtruth_content": "# 标题\n\n正文内容",
-  "groundtruth_content_list": [
-      {"type": "heading", "content": "标题", "level": 1},
-      {"type": "paragraph", "content": "正文内容"}
-   ],
   "url": "https://orderyourbooks.com/product-category/college-books-p-u/?products-per-page=all",
-  "layout_id": "orderyourbooks.com_4",
-  "max_layer_n": 10,
-  "url_host_name": "orderyourbooks.com",
-  "raw_warc_path": "s3://cc-raw-huawei/crawl-data/CC-MAIN-2025-13/segments/1742004433093.21/warc/CC-MAIN-20250319080618-20250319110618-00909.warc.gz?bytes=461610805,172252",
-  "language": "en",
-  "__dom_depth": 19,
-  "__dom_width": 10231,
-  "__type": "__max_depth",
-  "__tag": "DOM_WIDTH",
-  "marked_type": "unwanted",  # normal：正常标注的网页；unable：正文内容无法抉择；unwanted：无需标注的网页；
-  "unwanted_reason": "list"
+  "main_html": "<h1 cc-select=\"true\">这是标题</h1>",  # 从html中剪枝得到的正文html
+  "convert_main_content": "# 这是标题",  # 从main_html+html2text转化来
+  "groundtruth_content": "# 这是标题",  # 人工校准的markdown（部分提供）
+  "meta": {
+    "language": "en",  # 网页的语言
+    "style": "artical",  # 网页的文体
+    "DOM_WIDTH": 176,
+    "DOM_DEPTH": 27,
+    "text_linktext_ratio": 0.12252270850536746,
+    "table_text_ratio": 0,
+    "table_dom_depth": -1,
+    "text_distribution_dispersion": 0.2663,
+    "table": [],  # [], ["layout"], ["data"], ["layout", "data"]
+    "equation": [],  # [], ["inline"], ["interline"], ["inline", "interline"]
+    "code": [],  # [], ["inline"], ["interline"], ["inline", "interline"]
+    "table_complexity_score": 0,
+    "dom_complexity_score": 0.8442,
+    "text_dispersion_score": 0.2663,
+    "content_diversity_score": 0,
+    "link_complexity_score": 0.1225,
+    "overall_complexity_score": 0.3083,
+    "level": "mid"  # simple, mid, hard
+  }
 }
 ```
 
@@ -197,6 +204,95 @@ class MyExtractor(BaseExtractor):
 ExtractorFactory.register("my-extractor", MyExtractor)
 ```
 
+### 数据集统计分析工具
+
+WebMainBench 提供了强大的数据集统计分析工具 `scripts/statics.py`，用于分析数据集的各种特征并自动生成复杂度评分和难易程度分类。
+
+#### 功能特性
+
+- **DOM结构分析**：计算网页DOM树的深度和宽度
+- **文本链接比例分析**：统计文本与链接的比例关系
+- **表格复杂度分析**：评估表格内容的复杂程度
+- **内容类型检测**：自动识别公式、代码、表格等特殊内容
+- **复杂度评分**：基于多维度指标计算综合复杂度得分
+- **动态难易程度分类**：基于数据分布自动分类为 simple/mid/hard
+
+#### 使用方法
+
+```bash
+# 基本用法
+python scripts/statics.py data/input.jsonl --output data/output_with_stats.jsonl
+
+# 使用默认数据集
+python scripts/statics.py
+```
+
+#### 参数说明
+
+```bash
+# 查看所有可用参数
+python scripts/statics.py --help
+
+```
+
+#### 输出结果
+
+工具会在每条数据的 `meta` 字段中添加以下统计信息：
+
+```json
+{
+  "meta": {
+    "DOM_DEPTH": 25,                    // DOM树深度
+    "DOM_WIDTH": 1200,                  // DOM树宽度
+    "text_linktext_ratio": 0.85,        // 文本链接比例
+    "table_complexity_score": 0.3,      // 表格复杂度得分
+    "dom_complexity_score": 0.6,        // DOM复杂度得分
+    "text_dispersion_score": 0.4,       // 文本分布得分
+    "content_diversity_score": 0.7,     // 内容多样性得分
+    "link_complexity_score": 0.5,       // 链接复杂度得分
+    "overall_complexity_score": 0.52,   // 综合复杂度得分
+    "level": "mid"                      // 难易程度 (simple/mid/hard)
+  }
+}
+```
+
+#### 复杂度评分算法
+
+综合复杂度得分由以下维度加权计算：
+
+- **DOM结构复杂度 (25%)**：基于DOM深度和宽度，使用动态归一化
+- **文本分布复杂度 (25%)**：基于文本在DOM中的分布离散程度
+- **内容多样性 (25%)**：基于公式、代码、表格等特殊内容的种类
+- **链接复杂度 (25%)**：基于文本与链接的比例关系
+
+#### 运行示例
+
+```bash
+# 分析数据集并生成统计报告
+python scripts/statics.py data/sample_dataset.jsonl --output data/analyzed_dataset.jsonl
+
+# 输出示例：
+🔄 第一阶段: 计算基础统计和复杂度得分...
+  📊 已处理 100 条数据...
+  📊 已处理 200 条数据...
+
+🔄 第二阶段: 计算动态阈值和难易程度分类...
+📊 复杂度分布阈值计算:
+   总样本数: 1,827
+   30%分位数 (simple/mid分界): 0.3245
+   70%分位数 (mid/hard分界): 0.6789
+   复杂度得分范围: 0.0944 - 1.0000
+
+📊 难易程度分类结果:
+   Simple: 548 (30.0%)
+   Mid:    731 (40.0%)
+   Hard:   548 (30.0%)
+
+📝 正在写入数据到: data/analyzed_dataset.jsonl
+✅ 成功写入 1,827 条数据
+```
+
+
 ## 项目架构
 
 ```
 
@@ -889,7 +889,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
 
     # 1. 从真实数据集加载包含预处理HTML的数据
     print("1. 从真实数据集加载预处理HTML数据...")
-    dataset_path = Path("data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
+    dataset_path = Path("data/track_id_diff_result_56.jsonl")
     print(f"📂 数据集文件: {dataset_path}")
 
     # 加载数据集
 
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+比较两个JSONL文件，找出track_id在文件1中存在但在文件2中不存在的数据
+"""
+import json
+import sys
+from pathlib import Path
+
+def load_track_ids(jsonl_file):
+    """
+    Collect the unique, truthy ``track_id`` values found in a JSONL file.
+
+    Blank lines are ignored. A line is reported and skipped (instead of
+    aborting the whole scan) when it is not valid JSON, when it decodes to
+    something other than a JSON object, or when its ``track_id`` cannot be
+    stored in a set. One malformed record therefore no longer discards the
+    IDs gathered from the rest of the file.
+
+    Args:
+        jsonl_file: Path to the JSONL file.
+
+    Returns:
+        set: The collected track_id values; empty when the file does not
+        exist or cannot be read/decoded.
+    """
+    track_ids = set()
+    file_path = Path(jsonl_file)
+
+    if not file_path.exists():
+        print(f"❌ 文件不存在: {file_path}")
+        return track_ids
+
+    print(f"📖 正在读取文件: {file_path.name}")
+
+    line_count = 0
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                line_count += 1
+
+                # Report progress every 1000 non-blank lines.
+                if line_count % 1000 == 0:
+                    print(f"  📊 已处理 {line_count} 行...")
+
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError as e:
+                    print(f"  ⚠️ 第 {line_num} 行JSON解析错误: {e}")
+                    continue
+
+                # Valid JSON can still be a list/number/string, which has no .get().
+                if not isinstance(data, dict):
+                    print(f"  ⚠️ 第 {line_num} 行不是JSON对象，已跳过")
+                    continue
+
+                track_id = data.get('track_id')
+                if not track_id:
+                    continue
+
+                try:
+                    track_ids.add(track_id)
+                except TypeError:
+                    # A list/dict track_id is unhashable and cannot be a set member.
+                    print(f"  ⚠️ 第 {line_num} 行的track_id不可哈希，已跳过")
+
+    except (OSError, UnicodeError) as e:
+        # Only genuine read/decode failures invalidate the whole file.
+        print(f"❌ 读取文件时出错: {e}")
+        return set()
+
+    print(f"  ✅ 共找到 {len(track_ids)} 个唯一track_id")
+    return track_ids
+
+def load_data_with_track_ids(jsonl_file, target_track_ids):
+    """
+    Load the records of a JSONL file whose ``track_id`` is in a target collection.
+
+    Blank lines are ignored. Lines that are not valid JSON or that do not
+    decode to a JSON object are reported and skipped. A record whose
+    ``track_id`` is unhashable is treated as a non-match rather than
+    aborting the scan and losing the records already matched.
+
+    Args:
+        jsonl_file: Path to the JSONL file.
+        target_track_ids: Collection of track_id values to keep.
+
+    Returns:
+        list: Matching records (parsed dicts) in file order; empty when the
+        file does not exist or cannot be read/decoded.
+    """
+    matched_data = []
+    file_path = Path(jsonl_file)
+
+    if not file_path.exists():
+        print(f"❌ 文件不存在: {file_path}")
+        return matched_data
+
+    print(f"📖 正在从 {file_path.name} 中提取目标数据...")
+
+    line_count = 0
+
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                line_count += 1
+
+                # Report progress every 1000 non-blank lines.
+                if line_count % 1000 == 0:
+                    print(f"  📊 已处理 {line_count} 行，找到 {len(matched_data)} 条目标数据...")
+
+                try:
+                    data = json.loads(line)
+                except json.JSONDecodeError as e:
+                    print(f"  ⚠️ 第 {line_num} 行JSON解析错误: {e}")
+                    continue
+
+                # Valid JSON can still be a list/number/string, which has no .get().
+                if not isinstance(data, dict):
+                    print(f"  ⚠️ 第 {line_num} 行不是JSON对象，已跳过")
+                    continue
+
+                track_id = data.get('track_id')
+                try:
+                    is_target = track_id in target_track_ids
+                except TypeError:
+                    # An unhashable track_id cannot be looked up in a set of IDs.
+                    is_target = False
+
+                if is_target:
+                    matched_data.append(data)
+
+    except (OSError, UnicodeError) as e:
+        # Only genuine read/decode failures invalidate the whole file.
+        print(f"❌ 读取文件时出错: {e}")
+        return []
+
+    print(f"  ✅ 共找到 {len(matched_data)} 条目标数据")
+    return matched_data
+
+def main():
+    """
+    Command-line entry point: save the records whose track_id is in file 1 but not in file 2.
+
+    Usage: diff_jsonl.py [FILE1 FILE2 [OUTPUT]]
+
+    When fewer than two path arguments are given, the built-in default
+    input paths are used. OUTPUT is optional and defaults to
+    ``data/track_id_diff_result.jsonl``; its parent directory is created
+    when missing. Every outcome, including failures, is reported with
+    ``print`` and the function returns ``None``.
+    """
+    # Default input files
+    file1_default = "data/filtered_normal_data_1883.jsonl"
+    file2_default = "data/WebMainBench_1827_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl"
+    output_default = "data/track_id_diff_result.jsonl"
+
+    # Positional command-line arguments override the defaults.
+    if len(sys.argv) >= 3:
+        file1 = sys.argv[1]
+        file2 = sys.argv[2]
+    else:
+        file1 = file1_default
+        file2 = file2_default
+    output_file = sys.argv[3] if len(sys.argv) >= 4 else output_default
+
+    print("=" * 80)
+    print("🔍 比较JSONL文件中的track_id差异")
+    print("=" * 80)
+    print(f"📁 文件1 (源文件): {file1}")
+    print(f"📁 文件2 (对比文件): {file2}")
+    print("🎯 目标: 找出在文件1中存在但在文件2中不存在的track_id数据")
+    print()
+
+    # Step 1: load every track_id of file 1
+    print("🔸 步骤1: 加载文件1的track_id...")
+    track_ids_file1 = load_track_ids(file1)
+
+    if not track_ids_file1:
+        print("❌ 文件1中没有找到有效的track_id")
+        return
+
+    print()
+
+    # Step 2: load every track_id of file 2
+    print("🔸 步骤2: 加载文件2的track_id...")
+    track_ids_file2 = load_track_ids(file2)
+
+    if not track_ids_file2:
+        print("❌ 文件2中没有找到有效的track_id")
+        return
+
+    print()
+
+    # Step 3: set difference (and intersection, for the summary only)
+    print("🔸 步骤3: 计算差集...")
+    diff_track_ids = track_ids_file1 - track_ids_file2
+    common_track_ids = track_ids_file1 & track_ids_file2
+
+    print(f"  📊 文件1中的track_id数量: {len(track_ids_file1):,}")
+    print(f"  📊 文件2中的track_id数量: {len(track_ids_file2):,}")
+    print(f"  📊 共同的track_id数量: {len(common_track_ids):,}")
+    print(f"  ⭐ 差异的track_id数量: {len(diff_track_ids):,}")
+
+    if not diff_track_ids:
+        print("\n🎉 没有发现差异！文件1中的所有track_id在文件2中都存在。")
+        return
+
+    print()
+
+    # Step 4: re-read file 1 and keep the full records of the differing IDs
+    print("🔸 步骤4: 提取差异数据...")
+    diff_data = load_data_with_track_ids(file1, diff_track_ids)
+
+    if not diff_data:
+        print("❌ 没有找到差异数据")
+        return
+
+    print()
+
+    # Step 5: write the result as JSONL
+    print("🔸 步骤5: 保存差异数据...")
+
+    try:
+        output_path = Path(output_file)
+        # The target directory may not exist yet on a fresh checkout.
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.writelines(
+                json.dumps(data, ensure_ascii=False) + '\n' for data in diff_data
+            )
+    except (OSError, UnicodeError) as e:
+        print(f"❌ 保存文件时出错: {e}")
+        return
+
+    print(f"✅ 已保存 {len(diff_data)} 条差异数据到: {output_file}")
+
+    # Show a few differing track_ids; sort by their text form so repeated
+    # runs print the same sample even though set order is arbitrary.
+    print("\n📋 差异track_id示例 (前10个):")
+    for i, track_id in enumerate(sorted(diff_track_ids, key=str)[:10], 1):
+        print(f"  {i}. {track_id}")
+
+    if len(diff_track_ids) > 10:
+        print(f"  ... 还有 {len(diff_track_ids) - 10} 个")
+
+    print("\n" + "=" * 80)
+    print("🎉 比较完成!")
+    print("=" * 80)
+
+# Run the comparison only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()