@@ -67,17 +67,53 @@ def _load_file(path: Path) -> Optional[LoadedDoc]:
6767
def _load_url(url: str) -> Optional[LoadedDoc]:
    """Fetch *url* and return its textual content as a ``LoadedDoc``.

    Strategy:

    1. Pass the URL through ``_try_convert_github_url`` (intended to turn a
       GitHub blob URL into its raw counterpart).
    2. If the target is served from ``raw.githubusercontent.com`` or its path
       ends in a well-known plain-text/code suffix, download it directly.
    3. Otherwise (a generic web page) ask the Jina Reader endpoint
       (``https://r.jina.ai/``) for a cleaned-up rendering first.  The
       ``JINA_API_KEY`` environment variable, when set, is sent as a bearer
       token.
    4. If the reader was skipped, failed, or returned nothing useful, fall
       back to a direct request; HTML responses go through ``_strip_html``.

    Args:
        url: The address supplied by the caller.  It is used as both ``id``
            and ``source`` of the result, even when a converted URL is the
            one actually fetched.

    Returns:
        The loaded document, or ``None`` if anything goes wrong (this loader
        is best-effort and never raises).
    """
    # Local import so the block does not depend on the file's import section.
    from urllib.parse import urlparse

    # Suffixes of plain-text / source files that need no reader clean-up.
    raw_extensions = (
        ".txt", ".md", ".json", ".yaml", ".yml", ".csv", ".xml", ".ini", ".conf",
        ".py", ".js", ".ts", ".go", ".rs", ".java", ".c", ".cpp", ".h", ".cs",
        ".php", ".rb", ".sh",
    )

    try:
        # 1. GitHub conversion.
        target_url = _try_convert_github_url(url)

        # 2. Decide on the fetch strategy.  Inspect the parsed URL rather than
        #    the raw string so that a query string or fragment does not hide
        #    the file suffix, and so that the host name is only matched where
        #    it really is the host.
        parsed = urlparse(target_url)
        is_github_raw = (parsed.hostname or "").lower() == "raw.githubusercontent.com"
        is_pure_text = parsed.path.lower().endswith(raw_extensions)

        if not (is_github_raw or is_pure_text):
            # 3. Try Jina Reader (https://jina.ai/reader).
            headers = {"X-Retain-Images": "none"}
            jina_api_key = os.getenv("JINA_API_KEY")
            if jina_api_key:
                headers["Authorization"] = f"Bearer {jina_api_key}"

            try:
                r_jina = requests.get(
                    f"https://r.jina.ai/{target_url}",
                    headers=headers,
                    timeout=20,
                )
            except requests.RequestException:
                # Reader unreachable or timed out: fall back to direct fetch.
                r_jina = None

            # Only accept a successful, non-empty answer; anything else falls
            # through to the direct download below.
            if (
                r_jina is not None
                and r_jina.status_code == 200
                and r_jina.text.strip()
            ):
                return LoadedDoc(id=url, text=r_jina.text, source=url)

        # 4. Fallback / default path: request the target URL directly.
        r = requests.get(target_url, timeout=20)
        r.raise_for_status()

        content_type = r.headers.get("content-type", "").lower()
        # Crude tag stripping is the last-resort treatment for HTML.
        text = _strip_html(r.text) if "html" in content_type else r.text

        return LoadedDoc(id=url, text=text, source=url)
    except Exception:
        # Best-effort boundary: callers treat None as "could not load".
        return None
0 commit comments