Merge pull request #31 from e06084/main

e06084 · web-flow · commit a3ab1a287169 · 2025-08-25T13:30:52.000+08:00
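Update llm_webkit_extractor: call extract_content_from_main_html instead of extract_html_to_md, and use a repo-relative dataset path in examples/basic_usage.py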
diff --git a/examples/basic_usage.py b/examples/basic_usage.py
@@ -957,7 +957,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
     print("1. 从真实数据集加载预处理HTML数据...")
     
     # 使用DataLoader加载真实的样本数据
-    dataset_path = Path("/home/lulindong/Pycharm_projects/cc/WebMainBench_1848_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
+    dataset_path = Path("data/WebMainBench_dataset_merge_with_llm_webkit.jsonl")
     print(f"📂 数据集文件: {dataset_path}")
     
     if not dataset_path.exists():
diff --git a/webmainbench/extractors/llm_webkit_extractor.py b/webmainbench/extractors/llm_webkit_extractor.py
@@ -589,12 +589,12 @@ def _extract_content_from_main_html(self, main_html: str, url: str = None) -> tu
         """使用llm-webkit的方法将main_html提取成content"""
         import traceback
         try:
-            from llm_web_kit.simple import extract_html_to_md
+            from llm_web_kit.simple import extract_content_from_main_html
             
             print(f"🔧 开始使用llm-webkit简单接口提取content...")
             
             # 使用简单接口提取markdown，传入URL
-            content = extract_html_to_md(url or "", main_html, clip_html=False)
+            content = extract_content_from_main_html(url or "", main_html)
             
             print(f"✅ llm-webkit提取完成: {len(content)}字符")