1313from __future__ import annotations
1414
1515import json
16+ import os
1617from pathlib import Path
1718import re
1819from typing import Any , Literal
# Shared console object; the fetch helpers print their verbose status
# messages through it.
CONSOLE = Console()

# Output formats accepted by the CLI's output-format option.
OutputFormat = Literal["csv", "html", "json", "markdown", "raw-html", "txt", "xml", "xmltei"]
# Strategies for obtaining markdown: "auto" tries the non-browser readers in
# turn before rendering, the other values pin a single path.
FetchStrategy = Literal["auto", "agent", "jina", "browser"]
3638TWITTER_HOSTS = {
3739 "x.com" ,
3840 "www.x.com" ,
4244 "mobile.twitter.com" ,
4345}
# Root URL of the FxTwitter status API.
FXTWITTER_API_ROOT = "https://api.fxtwitter.com/2/status"
# Jina Reader endpoint; the target URL is appended directly after this prefix.
JINA_READER_API_ROOT = "https://r.jina.ai/"
# Name of the environment variable that may hold a Jina API key. When it is
# set to a non-blank value the key is sent as a Bearer token.
JINA_API_KEY_ENV = "JINA_API_KEY"
# Regular expressions that suggest a rate-limit, block, or CAPTCHA notice
# rather than real page content. They are searched against a lowercased
# prefix of the candidate text, so every pattern is written in lowercase.
BLOCKED_CONTENT_PATTERNS = (
    r"\brate limit(?:ed|ing)?\b",
    r"\btoo many requests\b",
    r"\baccess denied\b",
    r"\brequest blocked\b",
    r"\btemporarily blocked\b",
    r"\bverify you are human\b",
    r"\bcaptcha\b",
    r"\bcloudflare\b",
    r"\bplease enable javascript and cookies\b",
    r"\bunusual traffic\b",
    r"\bsecurity check\b",
)
4562# FxTwitter source repository: https://github.com/allnodes/FxTwitter
4663
4764
@@ -209,6 +226,76 @@ def fetch_agent_markdown(url: str, timeout_ms: int, verbose: bool) -> str | None
209226 return None
210227
211228
def fetch_jina_reader_markdown(url: str, timeout_ms: int, verbose: bool) -> str | None:
    """Fetch a page as Markdown through Jina Reader.

    The target URL is appended verbatim to ``JINA_READER_API_ROOT``. If the
    environment variable named by ``JINA_API_KEY_ENV`` holds a non-blank
    value, it is sent as a Bearer token; otherwise the request is anonymous.

    Args:
        url: The page to fetch.
        timeout_ms: Request timeout in milliseconds; at least one second is
            always allowed.
        verbose: When true, progress and failure messages are printed to
            ``CONSOLE``.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        body is blank.
    """

    reader_url = f"{JINA_READER_API_ROOT}{url}"
    api_key = os.getenv(JINA_API_KEY_ENV, "").strip()
    headers = {
        "Accept": "text/markdown, text/plain;q=0.9, */*;q=0.1",
        "User-Agent": "fetch-url/1.0 (+https://github.com/DCjanus/prompts/tree/master/skills/fetch-url)",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    if verbose:
        auth_mode = f"with {JINA_API_KEY_ENV}" if api_key else "without API key"
        CONSOLE.print(
            f"[cyan]Trying Jina Reader[/cyan] {auth_mode}",
            highlight=False,
        )

    request = Request(  # noqa: S310 - trusted local CLI input; the URL is supplied by the user
        reader_url,
        headers=headers,
    )
    # Only the network I/O lives inside the try block so that unrelated
    # errors are not mistaken for a fetch failure.
    try:
        with urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:  # noqa: S310
            charset = response.headers.get_content_charset() or "utf-8"
            payload = response.read()
    except (URLError, OSError) as exc:
        if verbose:
            CONSOLE.print(
                f"[yellow]Jina Reader failed, fallback to browser render[/yellow] ({exc})",
                highlight=False,
            )
        return None

    try:
        markdown = payload.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a charset Python does not know; decoding as
        # UTF-8 with replacement is better than crashing the whole fetch.
        markdown = payload.decode("utf-8", errors="replace")

    if not markdown.strip():
        return None
    if verbose:
        CONSOLE.print(
            f"[green]Jina Reader hit[/green] {len(markdown)} chars",
            highlight=False,
        )
    return markdown
271+
272+
def looks_like_blocked_content(content: str) -> bool:
    """Heuristically detect rate-limit, block, or CAPTCHA notice pages.

    Only the first 4000 characters are inspected, lowercased, and the result
    is true as soon as any entry of ``BLOCKED_CONTENT_PATTERNS`` matches.
    """

    head = content[:4000].lower()
    for blocked_pattern in BLOCKED_CONTENT_PATTERNS:
        if re.search(blocked_pattern, head):
            return True
    return False
278+
279+
def validate_markdown_candidate(
    content: str | None,
    source_name: str,
    verbose: bool,
) -> str | None:
    """Reject missing, blank, or apparently blocked Markdown candidates.

    Args:
        content: Markdown returned by a reader, or ``None`` if it produced
            nothing.
        source_name: Human-readable reader name used in verbose messages.
        verbose: When true, the reason for a rejection is printed to
            ``CONSOLE``.

    Returns:
        ``content`` unchanged when it looks usable; ``None`` when it is
        missing, contains only whitespace, or matches the blocked-content
        heuristic, so the caller can continue with its next fallback.
    """

    if content is None:
        return None
    if not content.strip():
        # A blank body is as unusable as no body; without this check it would
        # be accepted and suppress the remaining fallbacks.
        if verbose:
            CONSOLE.print(
                f"[yellow]{source_name} returned empty content, continue fallback[/yellow]",
                highlight=False,
            )
        return None
    if looks_like_blocked_content(content):
        if verbose:
            CONSOLE.print(
                f"[yellow]{source_name} returned suspected blocked content, continue fallback[/yellow]",
                highlight=False,
            )
        return None
    return content
297+
298+
212299def extract_twitter_status_id (url : str ) -> str | None :
213300 """从 x.com/twitter.com 推文链接提取 status id。"""
214301
@@ -432,6 +519,10 @@ def fetch(
432519 "markdown" ,
433520 help = "Output format: csv, html, json, markdown, raw-html, txt, xml, xmltei." ,
434521 ),
522+ fetch_strategy : FetchStrategy = typer .Option (
523+ "auto" ,
524+ help = "Fetch strategy for markdown: auto, agent, jina, browser." ,
525+ ),
435526 disable_twitter_api : bool = typer .Option (
436527 False ,
437528 "--disable-twitter-api" ,
@@ -443,6 +534,8 @@ def fetch(
443534 parsed = urlparse (url )
444535 if parsed .scheme not in {"http" , "https" }:
445536 raise typer .BadParameter ("Only http or https URLs are supported." )
537+ if output_format != "markdown" and fetch_strategy != "auto" :
538+ raise typer .BadParameter ("Custom fetch strategy is only supported with markdown output." )
446539
447540 resolved_browser_path = str (browser_path ) if browser_path else detect_browser_path ()
448541 try :
@@ -465,7 +558,43 @@ def fetch(
465558 CONSOLE .print ("[green]Using FxTwitter API markdown path[/green]" , highlight = False )
466559 if output_format == "markdown" :
467560 if content is None :
468- content = fetch_agent_markdown (url , timeout_ms = timeout_ms , verbose = verbose )
561+ if fetch_strategy == "auto" :
562+ content = validate_markdown_candidate (
563+ fetch_agent_markdown (url , timeout_ms = timeout_ms , verbose = verbose ),
564+ source_name = "Markdown negotiation" ,
565+ verbose = verbose ,
566+ )
567+ if content is None :
568+ content = validate_markdown_candidate (
569+ fetch_jina_reader_markdown (url , timeout_ms = timeout_ms , verbose = verbose ),
570+ source_name = "Jina Reader" ,
571+ verbose = verbose ,
572+ )
573+ elif fetch_strategy == "agent" :
574+ content = validate_markdown_candidate (
575+ fetch_agent_markdown (url , timeout_ms = timeout_ms , verbose = verbose ),
576+ source_name = "Markdown negotiation" ,
577+ verbose = verbose ,
578+ )
579+ if content is None :
580+ raise ValueError (
581+ "Markdown negotiation did not return usable content. "
582+ "Try --fetch-strategy jina or --fetch-strategy browser."
583+ )
584+ elif fetch_strategy == "jina" :
585+ content = validate_markdown_candidate (
586+ fetch_jina_reader_markdown (url , timeout_ms = timeout_ms , verbose = verbose ),
587+ source_name = "Jina Reader" ,
588+ verbose = verbose ,
589+ )
590+ if content is None :
591+ raise ValueError (
592+ "Jina Reader did not return usable content. "
593+ f"If this is rate limiting, configure { JINA_API_KEY_ENV } or try "
594+ "--fetch-strategy browser."
595+ )
596+ elif fetch_strategy == "browser" and verbose :
597+ CONSOLE .print ("[cyan]Skipping non-browser markdown readers[/cyan]" , highlight = False )
469598 if content is None :
470599 html = render_html (
471600 url ,
0 commit comments