1313from __future__ import annotations
1414
1515import json
16+ import os
1617from pathlib import Path
1718import re
1819from typing import Any , Literal
3334CONSOLE = Console ()
3435
# Output formats accepted by the CLI's output-format option.
OutputFormat = Literal["csv", "html", "json", "markdown", "raw-html", "txt", "xml", "xmltei"]
# Markdown fetch strategies: "auto" tries the non-browser readers before the
# browser, while the others force a single path.
FetchStrategy = Literal["auto", "agent", "jina", "browser"]
3638TWITTER_HOSTS = {
3739 "x.com" ,
3840 "www.x.com" ,
4244 "mobile.twitter.com" ,
4345}
4446FXTWITTER_API_ROOT = "https://api.fxtwitter.com/2/status"
# Jina Reader endpoint; the target URL is appended directly after the slash.
JINA_READER_API_ROOT = "https://r.jina.ai/"
# Environment variable that may hold a Jina API key (sent as a bearer token).
JINA_API_KEY_ENV = "JINA_API_KEY"
# Lowercase fragment groups used to spot rate-limit / challenge pages: a
# response matches when every fragment of any single group occurs in it.
JINA_BLOCK_PAGE_SIGNALS: tuple[tuple[str, ...], ...] = (
    ("rate limit", "jina"),
    ("too many requests", "jina"),
    ("rate limit exceeded", "jina"),
    ("retry after", "jina"),
    ("request limit reached", "jina"),
    ("security verification", "jina"),
)
4557# FxTwitter source repository: https://github.com/allnodes/FxTwitter
4658
4759
@@ -209,6 +221,64 @@ def fetch_agent_markdown(url: str, timeout_ms: int, verbose: bool) -> str | None
209221 return None
210222
211223
def fetch_jina_reader_markdown(url: str, timeout_ms: int, verbose: bool) -> str | None:
    """Fetch ``url`` as Markdown through Jina Reader.

    The target URL is appended verbatim to ``JINA_READER_API_ROOT``. When the
    environment variable named by ``JINA_API_KEY_ENV`` holds a non-blank
    value, it is sent as a bearer token.

    Args:
        url: Page to fetch.
        timeout_ms: Request timeout in milliseconds, clamped to at least one
            second.
        verbose: Print progress and diagnostics to ``CONSOLE``.

    Returns:
        The Markdown body, or ``None`` when the request fails, the body is
        blank, or the body looks like a Jina rate-limit / challenge page.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from http.client import HTTPException

    reader_url = f"{JINA_READER_API_ROOT}{url}"
    api_key = os.getenv(JINA_API_KEY_ENV, "").strip()
    headers = {
        "Accept": "text/markdown, text/plain;q=0.9, */*;q=0.1",
        "User-Agent": "fetch-url/1.0 (+https://github.com/DCjanus/prompts/tree/master/skills/fetch-url)",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    if verbose:
        auth_mode = f"with {JINA_API_KEY_ENV}" if api_key else "without API key"
        CONSOLE.print(
            f"[cyan]Trying Jina Reader[/cyan] {auth_mode}",
            highlight=False,
        )

    request = Request(  # noqa: S310 - trusted local CLI input; the URL is supplied by the user
        reader_url,
        headers=headers,
    )
    try:
        with urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:  # noqa: S310
            charset = response.headers.get_content_charset() or "utf-8"
            raw = response.read()
    except (URLError, OSError, HTTPException) as exc:
        # HTTPException (e.g. IncompleteRead, InvalidURL) is not an OSError;
        # treat it like any other failed attempt so callers can fall back.
        if verbose:
            CONSOLE.print(
                f"[yellow]Jina Reader failed[/yellow] ({exc})",
                highlight=False,
            )
        return None

    try:
        markdown = raw.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a charset Python does not know; fall back to
        # UTF-8 rather than aborting the whole fetch.
        markdown = raw.decode("utf-8", errors="replace")

    if not markdown.strip():
        return None
    if is_obvious_jina_block_page(markdown):
        if verbose:
            CONSOLE.print(
                "[yellow]Jina Reader returned a probable rate-limit page[/yellow]",
                highlight=False,
            )
        return None
    if verbose:
        CONSOLE.print(
            f"[green]Jina Reader hit[/green] {len(markdown)} chars",
            highlight=False,
        )
    return markdown
273+
274+
def is_obvious_jina_block_page(content: str) -> bool:
    """Return whether ``content`` looks like a Jina rate-limit or challenge page.

    Only the first 4000 characters are inspected, case-insensitively. The page
    matches when every fragment of at least one entry in
    ``JINA_BLOCK_PAGE_SIGNALS`` occurs in that prefix.
    """
    snippet = content[:4000].lower()
    return any(
        all(part in snippet for part in signal)
        for signal in JINA_BLOCK_PAGE_SIGNALS
    )
280+
281+
212282def extract_twitter_status_id (url : str ) -> str | None :
213283 """从 x.com/twitter.com 推文链接提取 status id。"""
214284
@@ -432,22 +502,23 @@ def fetch(
432502 "markdown" ,
433503 help = "Output format: csv, html, json, markdown, raw-html, txt, xml, xmltei." ,
434504 ),
435- disable_twitter_api : bool = typer .Option (
436- False ,
437- "--disable-twitter-api" ,
438- help = "Disable FxTwitter API optimization for x.com/twitter.com links in markdown mode." ,
505+ fetch_strategy : FetchStrategy = typer .Option (
506+ "auto" ,
507+ help = "Fetch strategy for markdown: auto, agent, jina, browser." ,
439508 ),
440509 verbose : bool = typer .Option (False , "--verbose" , help = "Print progress and diagnostic logs." ),
441510) -> None :
442511 """通过 Playwright 渲染并用 trafilatura 提取内容。"""
443512 parsed = urlparse (url )
444513 if parsed .scheme not in {"http" , "https" }:
445514 raise typer .BadParameter ("Only http or https URLs are supported." )
515+ if output_format != "markdown" and fetch_strategy != "auto" :
516+ raise typer .BadParameter ("Custom fetch strategy is only supported with markdown output." )
446517
447518 resolved_browser_path = str (browser_path ) if browser_path else detect_browser_path ()
448519 try :
449520 content : str | None = None
450- if output_format == "markdown" and not disable_twitter_api :
521+ if output_format == "markdown" and fetch_strategy == "auto" :
451522 twitter_status_id = extract_twitter_status_id (url )
452523 if twitter_status_id :
453524 payload = fetch_fxtwitter_status (
@@ -458,14 +529,38 @@ def fetch(
458529 if payload is None :
459530 raise ValueError (
460531 "FxTwitter API request failed for this Twitter/X URL. "
461- "Use --disable-twitter-api to skip this path."
532+ "Use --fetch-strategy agent, jina, or browser to skip this path."
462533 )
463534 content = render_fxtwitter_markdown (payload , source_url = url )
464535 if verbose :
465536 CONSOLE .print ("[green]Using FxTwitter API markdown path[/green]" , highlight = False )
466537 if output_format == "markdown" :
467538 if content is None :
468- content = fetch_agent_markdown (url , timeout_ms = timeout_ms , verbose = verbose )
539+ if fetch_strategy == "auto" :
540+ content = fetch_agent_markdown (url , timeout_ms = timeout_ms , verbose = verbose )
541+ if content is None :
542+ content = fetch_jina_reader_markdown (
543+ url ,
544+ timeout_ms = timeout_ms ,
545+ verbose = verbose ,
546+ )
547+ elif fetch_strategy == "agent" :
548+ content = fetch_agent_markdown (url , timeout_ms = timeout_ms , verbose = verbose )
549+ if content is None :
550+ raise ValueError (
551+ "Markdown negotiation did not return usable content. "
552+ "Try --fetch-strategy jina or --fetch-strategy browser."
553+ )
554+ elif fetch_strategy == "jina" :
555+ content = fetch_jina_reader_markdown (url , timeout_ms = timeout_ms , verbose = verbose )
556+ if content is None :
557+ raise ValueError (
558+ "Jina Reader did not return usable content. "
559+ f"If this is rate limiting, configure { JINA_API_KEY_ENV } or try "
560+ "--fetch-strategy browser."
561+ )
562+ elif fetch_strategy == "browser" and verbose :
563+ CONSOLE .print ("[cyan]Skipping non-browser markdown readers[/cyan]" , highlight = False )
469564 if content is None :
470565 html = render_html (
471566 url ,
0 commit comments