Skip to content

Commit f1d3a88

Browse files
authored
feat(annotate): try Accept: text/markdown before Jina/Turndown (#557)
Sites that support Cloudflare's "Markdown for Agents" (or similar content negotiation) now return markdown directly, skipping the Jina Reader and Turndown conversion entirely. The negotiation attempt uses a short 5s timeout so failures fall through quickly to the existing pipeline. For provenance purposes, this commit was AI-assisted.
1 parent e22c911 commit f1d3a88

2 files changed

Lines changed: 240 additions & 2 deletions

File tree

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
import { test, expect, mock, beforeEach, afterEach } from "bun:test";
2+
import { urlToMarkdown } from "./url-to-markdown";
3+
4+
// Track fetch calls to verify headers and URL selection
5+
let fetchCalls: { url: string; headers: Record<string, string> }[] = [];
6+
const originalFetch = globalThis.fetch;
7+
8+
beforeEach(() => {
9+
fetchCalls = [];
10+
});
11+
12+
afterEach(() => {
13+
globalThis.fetch = originalFetch;
14+
});
15+
16+
/**
17+
* Create a mock fetch that responds based on the Accept header.
18+
* When Accept includes text/markdown, returns markdown with the right content-type.
19+
* Otherwise returns HTML.
20+
*/
21+
function mockFetchWithMarkdownSupport(markdown: string) {
22+
return mock((url: string | URL | Request, init?: RequestInit) => {
23+
const headers = init?.headers as Record<string, string> | undefined;
24+
fetchCalls.push({ url: String(url), headers: headers ?? {} });
25+
26+
const accept = headers?.Accept || headers?.accept || "";
27+
if (accept.includes("text/markdown")) {
28+
return Promise.resolve(
29+
new Response(markdown, {
30+
status: 200,
31+
headers: {
32+
"content-type": "text/markdown; charset=utf-8",
33+
"x-markdown-tokens": "42",
34+
},
35+
}),
36+
);
37+
}
38+
return Promise.resolve(
39+
new Response("<html><body><p>Hello</p></body></html>", {
40+
status: 200,
41+
headers: { "content-type": "text/html; charset=utf-8" },
42+
}),
43+
);
44+
});
45+
}
46+
47+
/** Mock fetch that only returns HTML (no markdown support). */
48+
function mockFetchHtmlOnly(html = "<html><body><p>Fallback</p></body></html>") {
49+
return mock((url: string | URL | Request, init?: RequestInit) => {
50+
const headers = init?.headers as Record<string, string> | undefined;
51+
fetchCalls.push({ url: String(url), headers: headers ?? {} });
52+
return Promise.resolve(
53+
new Response(html, {
54+
status: 200,
55+
headers: { "content-type": "text/html; charset=utf-8" },
56+
}),
57+
);
58+
});
59+
}
60+
61+
test("content negotiation: uses markdown when server supports it", async () => {
62+
const md = "# Hello\n\nThis is markdown from the server.";
63+
globalThis.fetch = mockFetchWithMarkdownSupport(md) as typeof fetch;
64+
65+
const result = await urlToMarkdown("https://example.com/page", { useJina: true });
66+
67+
expect(result.source).toBe("content-negotiation");
68+
expect(result.markdown).toBe(md);
69+
// Should only make one fetch (the content negotiation request)
70+
expect(fetchCalls).toHaveLength(1);
71+
expect(fetchCalls[0].headers.Accept).toContain("text/markdown");
72+
});
73+
74+
test("content negotiation: falls through to Jina when server returns HTML", async () => {
75+
// First call (content negotiation) returns HTML, second (Jina) returns markdown
76+
let callCount = 0;
77+
globalThis.fetch = mock((url: string | URL | Request, init?: RequestInit) => {
78+
const headers = init?.headers as Record<string, string> | undefined;
79+
fetchCalls.push({ url: String(url), headers: headers ?? {} });
80+
callCount++;
81+
82+
if (callCount === 1) {
83+
// Content negotiation attempt — server doesn't support it
84+
return Promise.resolve(
85+
new Response("<html><body>Hi</body></html>", {
86+
status: 200,
87+
headers: { "content-type": "text/html; charset=utf-8" },
88+
}),
89+
);
90+
}
91+
// Jina Reader call
92+
return Promise.resolve(
93+
new Response("# From Jina", {
94+
status: 200,
95+
headers: { "content-type": "text/plain" },
96+
}),
97+
);
98+
}) as typeof fetch;
99+
100+
const result = await urlToMarkdown("https://example.com/page", { useJina: true });
101+
102+
expect(result.source).toBe("jina");
103+
expect(result.markdown).toBe("# From Jina");
104+
// Content negotiation fetch + Jina fetch
105+
expect(fetchCalls.length).toBeGreaterThanOrEqual(2);
106+
expect(fetchCalls[1].url).toContain("r.jina.ai");
107+
});
108+
109+
test("content negotiation: skipped for local URLs", async () => {
110+
let callCount = 0;
111+
globalThis.fetch = mock((_url: string | URL | Request, init?: RequestInit) => {
112+
const headers = init?.headers as Record<string, string> | undefined;
113+
fetchCalls.push({ url: String(_url), headers: headers ?? {} });
114+
callCount++;
115+
return Promise.resolve(
116+
new Response("<html><body>Local</body></html>", {
117+
status: 200,
118+
headers: { "content-type": "text/html; charset=utf-8" },
119+
}),
120+
);
121+
}) as typeof fetch;
122+
123+
const result = await urlToMarkdown("http://localhost:3000/readme", { useJina: false });
124+
125+
expect(result.source).toBe("fetch+turndown");
126+
// No content negotiation request should have been made
127+
// (first call should be the Turndown fetch, not a markdown request)
128+
for (const call of fetchCalls) {
129+
if (call.headers.Accept) {
130+
expect(call.headers.Accept).not.toContain("text/markdown");
131+
}
132+
}
133+
});
134+
135+
test("content negotiation: handles server error gracefully", async () => {
136+
let callCount = 0;
137+
globalThis.fetch = mock((_url: string | URL | Request, init?: RequestInit) => {
138+
const headers = init?.headers as Record<string, string> | undefined;
139+
fetchCalls.push({ url: String(_url), headers: headers ?? {} });
140+
callCount++;
141+
142+
if (callCount === 1) {
143+
// Content negotiation — server error
144+
return Promise.resolve(new Response(null, { status: 500 }));
145+
}
146+
// Jina fallback
147+
return Promise.resolve(
148+
new Response("# Jina fallback", {
149+
status: 200,
150+
headers: { "content-type": "text/plain" },
151+
}),
152+
);
153+
}) as typeof fetch;
154+
155+
const result = await urlToMarkdown("https://example.com/page", { useJina: true });
156+
157+
// Should fall through to Jina
158+
expect(result.source).toBe("jina");
159+
});
160+
161+
test("raw .md URL: still takes priority over content negotiation", async () => {
162+
globalThis.fetch = mock((_url: string | URL | Request, init?: RequestInit) => {
163+
const headers = init?.headers as Record<string, string> | undefined;
164+
fetchCalls.push({ url: String(_url), headers: headers ?? {} });
165+
return Promise.resolve(
166+
new Response("# Raw markdown file", {
167+
status: 200,
168+
headers: { "content-type": "text/plain; charset=utf-8" },
169+
}),
170+
);
171+
}) as typeof fetch;
172+
173+
const result = await urlToMarkdown("https://example.com/README.md", { useJina: true });
174+
175+
expect(result.source).toBe("fetch-raw");
176+
expect(result.markdown).toBe("# Raw markdown file");
177+
});

packages/shared/url-to-markdown.ts

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export interface UrlToMarkdownOptions {
1414

1515
export interface UrlToMarkdownResult {
1616
markdown: string;
17-
source: "jina" | "fetch+turndown" | "fetch-raw";
17+
source: "jina" | "fetch+turndown" | "fetch-raw" | "content-negotiation";
1818
}
1919

2020
const FETCH_TIMEOUT_MS = 30_000;
@@ -86,7 +86,17 @@ export async function urlToMarkdown(
8686
// Server returned HTML for this .md URL — fall through to normal conversion
8787
}
8888

89-
if (options.useJina && !isLocalUrl(url)) {
89+
// Content negotiation fast path — if the server natively returns markdown
90+
// (e.g. Cloudflare's Markdown for Agents), skip Jina/Turndown entirely.
91+
const local = isLocalUrl(url);
92+
if (!local) {
93+
const negotiated = await fetchViaContentNegotiation(url);
94+
if (negotiated !== null) {
95+
return { markdown: negotiated, source: "content-negotiation" };
96+
}
97+
}
98+
99+
if (options.useJina && !local) {
90100
try {
91101
const markdown = await fetchViaJina(url);
92102
return { markdown, source: "jina" };
@@ -190,6 +200,57 @@ async function fetchRawText(url: string): Promise<string | null> {
190200
}
191201
}
192202

203+
/**
204+
* Content negotiation fast path — request `text/markdown` via the Accept header.
205+
* Sites that support Cloudflare's "Markdown for Agents" (or similar) will return
206+
* markdown directly, letting us skip Jina and Turndown entirely.
207+
* Returns null if the server doesn't serve markdown.
208+
*/
209+
const NEGOTIATION_TIMEOUT_MS = 5_000; // Short timeout — this is a best-effort optimization
210+
211+
async function fetchViaContentNegotiation(url: string): Promise<string | null> {
212+
const controller = new AbortController();
213+
const timer = setTimeout(() => controller.abort(), NEGOTIATION_TIMEOUT_MS);
214+
const headers = {
215+
"User-Agent": "Mozilla/5.0 (compatible; Plannotator/1.0; +https://plannotator.ai)",
216+
Accept: "text/markdown, text/html;q=0.9",
217+
};
218+
219+
try {
220+
let currentUrl = url;
221+
let res = await fetch(currentUrl, { headers, redirect: "manual", signal: controller.signal });
222+
223+
for (let i = 0; i < MAX_REDIRECTS && REDIRECT_STATUSES.has(res.status); i++) {
224+
const location = res.headers.get("location");
225+
if (!location) break;
226+
currentUrl = new URL(location, currentUrl).href;
227+
if (isLocalUrl(currentUrl)) {
228+
res.body?.cancel();
229+
return null;
230+
}
231+
res.body?.cancel();
232+
res = await fetch(currentUrl, { headers, redirect: "manual", signal: controller.signal });
233+
}
234+
235+
if (!res.ok) {
236+
res.body?.cancel();
237+
return null;
238+
}
239+
240+
const ct = res.headers.get("content-type") || "";
241+
if (!ct.includes("text/markdown")) {
242+
res.body?.cancel();
243+
return null;
244+
}
245+
246+
return await readBodyWithLimit(res);
247+
} catch {
248+
return null;
249+
} finally {
250+
clearTimeout(timer);
251+
}
252+
}
253+
193254
/** Fetch via Jina Reader — returns markdown directly. */
194255
async function fetchViaJina(url: string): Promise<string> {
195256
// Strip fragment (never sent to server) and encode for Jina's path-based API

0 commit comments

Comments
 (0)