Skip to content

Commit a3ab1a2

Browse files
authored
Merge pull request #31 from e06084/main
update llm_webkit_extractor
2 parents 5dd4c52 + d45dceb commit a3ab1a2

File tree

2 files changed

+3 -3 lines changed

2 files changed

+3 -3 lines changed

examples/basic_usage.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -957,7 +957,7 @@ def demo_llm_webkit_with_preprocessed_html_evaluation():
957 957   print("1. 从真实数据集加载预处理HTML数据...")
958 958
959 959   # 使用DataLoader加载真实的样本数据
960     - dataset_path = Path("/home/lulindong/Pycharm_projects/cc/WebMainBench_1848_v1_WebMainBench_dataset_merge_with_llm_webkit.jsonl")
    960 + dataset_path = Path("data/WebMainBench_dataset_merge_with_llm_webkit.jsonl")
961 961   print(f"📂 数据集文件: {dataset_path}")
962 962
963 963   if not dataset_path.exists():

webmainbench/extractors/llm_webkit_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -589,12 +589,12 @@ def _extract_content_from_main_html(self, main_html: str, url: str = None) -> tu
589 589   """使用llm-webkit的方法将main_html提取成content"""
590 590   import traceback
591 591   try:
592     - from llm_web_kit.simple import extract_html_to_md
    592 + from llm_web_kit.simple import extract_content_from_main_html
593 593
594 594   print(f"🔧 开始使用llm-webkit简单接口提取content...")
595 595
596 596   # 使用简单接口提取markdown,传入URL
597     - content = extract_html_to_md(url or "", main_html, clip_html=False)
    597 + content = extract_content_from_main_html(url or "", main_html)
598 598
599 599   print(f"✅ llm-webkit提取完成: {len(content)}字符")
600 600

0 commit comments

Comments
 (0)