Skip to content

Commit 3203e61

Browse files
authored
index international text, ignore html entities (#825)
* index international text, ignore html entities
* normalize after entities are decoded
1 parent fba1589 commit 3203e61

File tree

5 files changed

+23
-8
lines changed

5 files changed

+23
-8
lines changed

src/client/search.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@ const index = await fetch(import.meta.resolve("./minisearch.json"))
1616
.then((json) =>
1717
MiniSearch.loadJS(json, {
1818
...json.options,
19-
processTerm: (term) => term.slice(0, 15).toLowerCase() // see src/minisearch.json.ts
19+
processTerm: (term) =>
20+
term
21+
.slice(0, 15)
22+
.normalize("NFD")
23+
.replace(/[\u0300-\u036f]/g, "")
24+
.toLowerCase() // see src/minisearch.json.ts
2025
})
2126
);
2227

src/search.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import {basename, join} from "node:path";
2+
import he from "he";
23
import MiniSearch from "minisearch";
34
import type {Config} from "./config.js";
45
import {visitMarkdownFiles} from "./files.js";
@@ -20,7 +21,7 @@ const indexOptions = {
2021
fields: ["title", "text"],
2122
storeFields: ["title"],
2223
processTerm(term) {
23-
return term.match(/\d/g)?.length > 6 ? null : term.slice(0, 15).toLowerCase(); // fields to return with search results
24+
return term.match(/\p{N}/gu)?.length > 6 ? null : term.slice(0, 15).toLowerCase(); // fields to return with search results
2425
}
2526
};
2627

@@ -55,11 +56,17 @@ export async function searchIndex(config: Config, effects = defaultEffects): Pro
5556
// implicitly a leading slash here.
5657
const id = file.slice(0, basename(file) === "index.md" ? -"index.md".length : -3);
5758

58-
const text = html
59-
.replaceAll(/[\n\r]/g, " ")
60-
.replaceAll(/<style\b.*<\/style\b[^>]*>/gi, " ")
61-
.replaceAll(/<[^>]+>/g, " ")
62-
.replaceAll(/\W+/g, " ");
59+
// eslint-disable-next-line import/no-named-as-default-member
60+
const text = he
61+
.decode(
62+
html
63+
.replaceAll(/[\n\r]/g, " ")
64+
.replaceAll(/<style\b.*<\/style\b[^>]*>/gi, " ")
65+
.replaceAll(/<[^>]+>/g, " ")
66+
)
67+
.normalize("NFD")
68+
.replaceAll(/[\u0300-\u036f]/g, "")
69+
.replace(/[^\p{L}\p{N}]/gu, " "); // keep letters & numbers
6370

6471
effects.logger.log(`${faint("search indexing")} ${path}`);
6572
index.add({id, title, text});

test/input/build/search-public/page1.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ title: page 1
66

77
Text. (And some 🤯 ☔️ emoji)
88

9+
Cél&egrave;bre ! Mañana … Добридень &copy; &lt;&amp;&gt;
10+
911
<style type="text/css">
1012

1113
.this:should(be) {
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"options":{"fields":["title","text"],"storeFields":["title"]},"documentCount":2,"nextId":2,"documentIds":{"0":"page1","1":"sub/page2"},"fieldIds":{"title":0,"text":1},"fieldLength":{"0":[2,7],"1":[2,4]},"averageFieldLength":[2,5.5],"storedFields":{"0":{"title":"page 1"},"1":{"title":"Page 2"}},"dirtCount":0,"index":[["2",{"0":{"1":1},"1":{"1":1}}],["emoji",{"1":{"0":1}}],["some",{"1":{"0":1}}],["and",{"1":{"0":1}}],["text",{"1":{"0":1,"1":1}}],["1",{"0":{"0":1},"1":{"0":1}}],["page",{"0":{"0":1,"1":1},"1":{"0":1,"1":1}}]],"serializationVersion":2}
1+
{"options":{"fields":["title","text"],"storeFields":["title"]},"documentCount":2,"nextId":2,"documentIds":{"0":"page1","1":"sub/page2"},"fieldIds":{"title":0,"text":1},"fieldLength":{"0":[2,10],"1":[2,4]},"averageFieldLength":[2,7],"storedFields":{"0":{"title":"page 1"},"1":{"title":"Page 2"}},"dirtCount":0,"index":[["2",{"0":{"1":1},"1":{"1":1}}],["добридень",{"1":{"0":1}}],["manana",{"1":{"0":1}}],["celebre",{"1":{"0":1}}],["emoji",{"1":{"0":1}}],["some",{"1":{"0":1}}],["and",{"1":{"0":1}}],["text",{"1":{"0":1,"1":1}}],["1",{"0":{"0":1},"1":{"0":1}}],["page",{"0":{"0":1,"1":1},"1":{"0":1,"1":1}}]],"serializationVersion":2}

test/output/build/search-public/page1.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
<main id="observablehq-main" class="observablehq">
4141
<h1 id="page-1" tabindex="-1"><a class="observablehq-header-anchor" href="#page-1">Page 1</a></h1>
4242
<p>Text. (And some 🤯 ☔️ emoji)</p>
43+
<p>Célèbre ! Mañana … Добридень © &lt;&amp;&gt;</p>
4344
<style type="text/css">
4445

4546
.this:should(be) {

0 commit comments

Comments
 (0)