Skip to content

Commit ab9f577

Browse files
LorisSigrist, benmccann, Rich Harris
authored
feat: crawl URLs in <meta> tags (#9900)
* Crawl social-image urls during prerender
* Formatting & Linting
* Format changeset & added exhaustive list of crawlable urls
* Changed severity to minor as described in #5228
* Added support for `property` attribute & limited valid names to just social tags
* More tests
* Better changeset message - I'm indecisive
* Update .changeset/thirty-garlics-tan.md
  Co-authored-by: Ben McCann <[email protected]>
* simplify
* simplify
* Removed redundant data-sanitation
* DRY out

---------

Co-authored-by: Ben McCann <[email protected]>
Co-authored-by: Rich Harris <[email protected]>
1 parent 348029b commit ab9f577

File tree

4 files changed

+91
-43
lines changed

4 files changed

+91
-43
lines changed

.changeset/thirty-garlics-tan.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@sveltejs/kit': minor
3+
---
4+
5+
feat: crawl URLs in `<meta>` tags

packages/kit/src/core/postbuild/crawl.js

Lines changed: 68 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,20 @@ const ATTRIBUTE_NAME = /[^\t\n\f />"'=]/;
1313

1414
const WHITESPACE = /[\s\n\r]/;
1515

16+
const CRAWLABLE_META_NAME_ATTRS = new Set([
17+
'og:url',
18+
'og:image',
19+
'og:image:url',
20+
'og:image:secure_url',
21+
'og:video',
22+
'og:video:url',
23+
'og:video:secure_url',
24+
'og:audio',
25+
'og:audio:url',
26+
'og:audio:secure_url',
27+
'twitter:image'
28+
]);
29+
1630
/**
1731
* @param {string} html
1832
* @param {string} base
@@ -81,6 +95,9 @@ export function crawl(html, base) {
8195

8296
const tag = html.slice(start, i).toUpperCase();
8397

98+
/** @type {Record<string, string>} */
99+
const attributes = {};
100+
84101
if (tag === 'SCRIPT' || tag === 'STYLE') {
85102
while (i < html.length) {
86103
if (
@@ -95,9 +112,6 @@ export function crawl(html, base) {
95112
}
96113
}
97114

98-
let href = '';
99-
let rel = '';
100-
101115
while (i < html.length) {
102116
const start = i;
103117

@@ -159,44 +173,7 @@ export function crawl(html, base) {
159173
}
160174

161175
value = decode(value);
162-
163-
if (name === 'href') {
164-
if (tag === 'BASE') {
165-
base = resolve(base, value);
166-
} else {
167-
href = resolve(base, value);
168-
}
169-
} else if (name === 'id') {
170-
ids.push(value);
171-
} else if (name === 'name') {
172-
if (tag === 'A') ids.push(value);
173-
} else if (name === 'rel') {
174-
rel = value;
175-
} else if (name === 'src') {
176-
if (value) hrefs.push(resolve(base, value));
177-
} else if (name === 'srcset') {
178-
const candidates = [];
179-
let insideURL = true;
180-
value = value.trim();
181-
for (let i = 0; i < value.length; i++) {
182-
if (
183-
value[i] === ',' &&
184-
(!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))
185-
) {
186-
candidates.push(value.slice(0, i));
187-
value = value.substring(i + 1).trim();
188-
i = 0;
189-
insideURL = true;
190-
} else if (WHITESPACE.test(value[i])) {
191-
insideURL = false;
192-
}
193-
}
194-
candidates.push(value);
195-
for (const candidate of candidates) {
196-
const src = candidate.split(WHITESPACE)[0];
197-
if (src) hrefs.push(resolve(base, src));
198-
}
199-
}
176+
attributes[name] = value;
200177
} else {
201178
i -= 1;
202179
}
@@ -205,8 +182,56 @@ export function crawl(html, base) {
205182
i += 1;
206183
}
207184

208-
if (href && !/\bexternal\b/i.test(rel)) {
209-
hrefs.push(resolve(base, href));
185+
const { href, id, name, property, rel, src, srcset, content } = attributes;
186+
187+
if (href) {
188+
if (tag === 'BASE') {
189+
base = resolve(base, href);
190+
} else if (!rel || !/\bexternal\b/i.test(rel)) {
191+
hrefs.push(resolve(base, href));
192+
}
193+
}
194+
195+
if (id) {
196+
ids.push(id);
197+
}
198+
199+
if (name && tag === 'A') {
200+
ids.push(name);
201+
}
202+
203+
if (src) {
204+
hrefs.push(resolve(base, src));
205+
}
206+
207+
if (srcset) {
208+
let value = srcset;
209+
const candidates = [];
210+
let insideURL = true;
211+
value = value.trim();
212+
for (let i = 0; i < value.length; i++) {
213+
if (value[i] === ',' && (!insideURL || (insideURL && WHITESPACE.test(value[i + 1])))) {
214+
candidates.push(value.slice(0, i));
215+
value = value.substring(i + 1).trim();
216+
i = 0;
217+
insideURL = true;
218+
} else if (WHITESPACE.test(value[i])) {
219+
insideURL = false;
220+
}
221+
}
222+
candidates.push(value);
223+
for (const candidate of candidates) {
224+
const src = candidate.split(WHITESPACE)[0];
225+
if (src) hrefs.push(resolve(base, src));
226+
}
227+
}
228+
229+
if (tag === 'META' && content) {
230+
const attr = name ?? property;
231+
232+
if (attr && CRAWLABLE_META_NAME_ATTRS.has(attr)) {
233+
hrefs.push(resolve(base, content));
234+
}
210235
}
211236
}
212237
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<!DOCTYPE html>
2+
<html>
3+
<head>
4+
<meta name="viewport" content="width=device-width, initial-scale=1" />
5+
<meta name="description" content="This is a description" />
6+
7+
<!--Only these should get crawled-->
8+
<meta content="https://external.com" name="twitter:image" />
9+
<meta name="og:image" content="/og-image.jpg" />
10+
<meta property="og:audio" content="https://example.com/audio.mp3" />
11+
<meta content="/video.mp4" property="og:video"/>
12+
</head>
13+
<body></body>
14+
</html>
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"hrefs": ["https://external.com", "/og-image.jpg", "https://example.com/audio.mp3", "/video.mp4"],
3+
"ids": []
4+
}

0 commit comments

Comments
 (0)