Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions skills/fetch-url/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,27 @@ uv run playwright install chromium
- `--timeout-ms`:Playwright 导航超时(毫秒,默认 60000)。
- `--browser-path`:指定本地 Chromium 系浏览器路径(默认自动探测)。
- `--output-format`:输出格式(默认 `markdown`),支持 `csv`、`html`、`json`、`markdown`、`raw-html`、`txt`、`xml`、`xmltei`;`raw-html` 直接输出渲染后的 HTML(不经 trafilatura)。
- `--fetch-strategy`:仅 `markdown` 可用,支持 `auto`、`agent`、`jina`、`browser`。默认 `auto`。
- `--disable-twitter-api`:关闭 Twitter/X 的 FxTwitter API 优化路径。

Markdown 抓取顺序:
- Twitter/X 推文链接:默认先走 FxTwitter API。
- 其它 Markdown 请求:`--fetch-strategy auto` 时先尝试原站 `Accept: text/markdown` 协商,再尝试 Jina Reader:`https://r.jina.ai/<URL>`,最后回退到本地 Playwright 渲染并提取。
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
- 如需更明确控制兜底方式,可手工指定:
  - `--fetch-strategy agent`:只尝试原站 Markdown 协商。
  - `--fetch-strategy jina`:只尝试 Jina Reader。
  - `--fetch-strategy browser`:直接走本地 Playwright。

疑似限流/拦截页检测:
- `auto` 模式下,如果原站协商或 Jina Reader 返回的其实是限流/验证码/拦截提示,而不是正文,脚本会把它视为不可用结果并继续兜底。
- 当前会检测常见关键词,例如 `rate limit`、`too many requests`、`access denied`、`captcha`、`cloudflare`、`verify you are human`。
- 如果明确知道某个 reader 的结果不可用,调用方(例如 AI agent)可以直接改用 `--fetch-strategy browser`,跳过其它 reader,直接走本地 Playwright 兜底。

Jina Reader:
- 脚本会读取环境变量 `JINA_API_KEY`;如果存在,就以 `Authorization: Bearer <token>` 方式传给 Jina Reader。
- 不设置 `JINA_API_KEY` 也能用 Jina Reader,但官方公开配额较低;当前按更保守口径可认为无 Key 时大约 `20 RPM`。
- 如果遇到 Jina Reader 限流,可提示用户配置 `JINA_API_KEY` 以提升配额;当前官方 Reader 产品页给出的普通 API Key 配额是 `500 RPM`,Premium 是 `5000 RPM`。

Twitter/X 特化(仅 `markdown`):
- 当 URL 命中 `x.com`/`twitter.com` 推文链接且未设置 `--disable-twitter-api`,脚本会优先调用 `https://api.fxtwitter.com/2/status/{id}`。
- 当 FxTwitter 返回 `thread` 数据时,Markdown 会附加 `## Thread` 小节,按顺序列出 thread 内其它推文(自动去重主推文)。
Expand All @@ -39,6 +58,9 @@ Twitter/X 特化(仅 `markdown`):

```bash
./scripts/fetch_url.py https://example.com --output ./page.md --timeout-ms 60000
./scripts/fetch_url.py https://example.com --fetch-strategy jina
JINA_API_KEY=your-token ./scripts/fetch_url.py https://example.com --fetch-strategy jina
./scripts/fetch_url.py https://example.com --fetch-strategy browser
./scripts/fetch_url.py https://x.com/jack/status/20 --output-format markdown
./scripts/fetch_url.py https://x.com/jack/status/20 --output-format markdown --disable-twitter-api
```
Expand Down
131 changes: 130 additions & 1 deletion skills/fetch-url/scripts/fetch_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from __future__ import annotations

import json
import os
from pathlib import Path
import re
from typing import Any, Literal
Expand All @@ -33,6 +34,7 @@
# Shared console object used by this script for status and diagnostic output
# (messages use inline style markup such as [cyan]...[/cyan]).
CONSOLE = Console()

# Values accepted for the output format option of the `fetch` command.
OutputFormat = Literal["csv", "html", "json", "markdown", "raw-html", "txt", "xml", "xmltei"]
# Values accepted for the `fetch_strategy` option; "auto" is its default.
FetchStrategy = Literal["auto", "agent", "jina", "browser"]
TWITTER_HOSTS = {
"x.com",
"www.x.com",
Expand All @@ -42,6 +44,21 @@
"mobile.twitter.com",
}
FXTWITTER_API_ROOT = "https://api.fxtwitter.com/2/status"
# Base URL of the Jina Reader service; the target URL is appended directly to it.
JINA_READER_API_ROOT = "https://r.jina.ai/"
# Name of the environment variable that may hold an optional Jina API key.
JINA_API_KEY_ENV = "JINA_API_KEY"
# Regular expressions that mark a response as a rate-limit / block / captcha
# notice instead of real page content. They are searched against lower-cased
# text, so every pattern must itself be written in lower case.
# NOTE(review): generic terms such as "cloudflare" or "captcha" can also appear
# in legitimate articles, so a match may be a false positive - confirm this
# trade-off is acceptable.
BLOCKED_CONTENT_PATTERNS = (
r"\brate limit(?:ed|ing)?\b",
r"\btoo many requests\b",
r"\baccess denied\b",
r"\brequest blocked\b",
r"\btemporarily blocked\b",
r"\bverify you are human\b",
r"\bcaptcha\b",
r"\bcloudflare\b",
r"\bplease enable javascript and cookies\b",
r"\bunusual traffic\b",
r"\bsecurity check\b",
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
# FxTwitter source repository: https://github.com/allnodes/FxTwitter


Expand Down Expand Up @@ -209,6 +226,76 @@ def fetch_agent_markdown(url: str, timeout_ms: int, verbose: bool) -> str | None
return None


def fetch_jina_reader_markdown(url: str, timeout_ms: int, verbose: bool) -> str | None:
    """Fetch ``url`` as Markdown through the Jina Reader service.

    The target URL is appended to ``JINA_READER_API_ROOT``. When the
    environment variable named by ``JINA_API_KEY_ENV`` holds a non-blank
    value, it is sent as an ``Authorization: Bearer`` header.

    Args:
        url: Page to fetch.
        timeout_ms: Request timeout in milliseconds; a minimum of one second
            is always granted.
        verbose: When true, print progress and failure details to ``CONSOLE``.

    Returns:
        The decoded response body, or ``None`` when the request fails or the
        body is empty / whitespace-only. Deciding what happens after a
        ``None`` (fallback or error) is left to the caller.
    """

    api_key = os.getenv(JINA_API_KEY_ENV, "").strip()
    headers = {
        "Accept": "text/markdown, text/plain;q=0.9, */*;q=0.1",
        "User-Agent": "fetch-url/1.0 (+https://github.com/DCjanus/prompts/tree/master/skills/fetch-url)",
    }
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    if verbose:
        auth_mode = f"with {JINA_API_KEY_ENV}" if api_key else "without API key"
        CONSOLE.print(
            f"[cyan]Trying Jina Reader[/cyan] {auth_mode}",
            highlight=False,
        )

    request = Request(  # noqa: S310 - trusted local CLI input; the URL is supplied by the user
        f"{JINA_READER_API_ROOT}{url}",
        headers=headers,
    )
    # Keep the try body limited to the network calls that can actually fail.
    try:
        with urlopen(request, timeout=max(timeout_ms / 1000.0, 1.0)) as response:  # noqa: S310
            charset = response.headers.get_content_charset() or "utf-8"
            body = response.read()
    except (URLError, OSError) as exc:
        if verbose:
            # Do not promise a specific fallback here: with an explicit
            # "jina" strategy the caller raises instead of falling back.
            CONSOLE.print(
                f"[yellow]Jina Reader request failed[/yellow] ({exc})",
                highlight=False,
            )
        return None

    try:
        markdown = body.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a charset Python does not know; degrade to
        # UTF-8 instead of crashing the whole command.
        markdown = body.decode("utf-8", errors="replace")

    if not markdown.strip():
        return None
    if verbose:
        CONSOLE.print(
            f"[green]Jina Reader hit[/green] {len(markdown)} chars",
            highlight=False,
        )
    return markdown


def looks_like_blocked_content(content: str) -> bool:
    """Heuristically detect rate-limit, block, or captcha notice pages.

    Only the first 4000 characters are inspected, after lower-casing them.
    Returns True as soon as any entry of ``BLOCKED_CONTENT_PATTERNS`` matches.
    """

    head = content[:4000].lower()
    for blocked_pattern in BLOCKED_CONTENT_PATTERNS:
        if re.search(blocked_pattern, head) is not None:
            return True
    return False


def validate_markdown_candidate(
content: str | None,
source_name: str,
verbose: bool,
) -> str | None:
"""过滤明显的空结果或限流提示页。"""

if content is None:
return None
if looks_like_blocked_content(content):
if verbose:
CONSOLE.print(
f"[yellow]{source_name} returned suspected blocked content, continue fallback[/yellow]",
highlight=False,
)
return None
return content


def extract_twitter_status_id(url: str) -> str | None:
"""从 x.com/twitter.com 推文链接提取 status id。"""

Expand Down Expand Up @@ -432,6 +519,10 @@ def fetch(
"markdown",
help="Output format: csv, html, json, markdown, raw-html, txt, xml, xmltei.",
),
fetch_strategy: FetchStrategy = typer.Option(
"auto",
help="Fetch strategy for markdown: auto, agent, jina, browser.",
),
disable_twitter_api: bool = typer.Option(
False,
"--disable-twitter-api",
Expand All @@ -443,6 +534,8 @@ def fetch(
parsed = urlparse(url)
if parsed.scheme not in {"http", "https"}:
raise typer.BadParameter("Only http or https URLs are supported.")
if output_format != "markdown" and fetch_strategy != "auto":
raise typer.BadParameter("Custom fetch strategy is only supported with markdown output.")

resolved_browser_path = str(browser_path) if browser_path else detect_browser_path()
try:
Expand All @@ -465,7 +558,43 @@ def fetch(
CONSOLE.print("[green]Using FxTwitter API markdown path[/green]", highlight=False)
if output_format == "markdown":
if content is None:
content = fetch_agent_markdown(url, timeout_ms=timeout_ms, verbose=verbose)
if fetch_strategy == "auto":
content = validate_markdown_candidate(
fetch_agent_markdown(url, timeout_ms=timeout_ms, verbose=verbose),
source_name="Markdown negotiation",
verbose=verbose,
)
if content is None:
content = validate_markdown_candidate(
fetch_jina_reader_markdown(url, timeout_ms=timeout_ms, verbose=verbose),
source_name="Jina Reader",
verbose=verbose,
)
elif fetch_strategy == "agent":
content = validate_markdown_candidate(
fetch_agent_markdown(url, timeout_ms=timeout_ms, verbose=verbose),
source_name="Markdown negotiation",
verbose=verbose,
)
if content is None:
raise ValueError(
"Markdown negotiation did not return usable content. "
"Try --fetch-strategy jina or --fetch-strategy browser."
)
elif fetch_strategy == "jina":
content = validate_markdown_candidate(
fetch_jina_reader_markdown(url, timeout_ms=timeout_ms, verbose=verbose),
source_name="Jina Reader",
verbose=verbose,
)
if content is None:
raise ValueError(
"Jina Reader did not return usable content. "
f"If this is rate limiting, configure {JINA_API_KEY_ENV} or try "
"--fetch-strategy browser."
)
elif fetch_strategy == "browser" and verbose:
CONSOLE.print("[cyan]Skipping non-browser markdown readers[/cyan]", highlight=False)
if content is None:
html = render_html(
url,
Expand Down