-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild-search-index.mjs
More file actions
49 lines (36 loc) · 1.3 KB
/
build-search-index.mjs
File metadata and controls
49 lines (36 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env node
/**
* Builds a MiniSearch full-text search index from all HTML files in the tree.
* Outputs search-index.json which is loaded by index.html at runtime.
*/
import { readFileSync, writeFileSync } from "node:fs";
import { glob } from "node:fs/promises";
import { sep } from "node:path";
import * as cheerio from "cheerio";
import MiniSearch from "minisearch";
// Directory that is scanned; relative paths below are resolved against it.
const ROOT = process.cwd();

// Pages that must not appear in search results, as "/"-separated paths
// relative to ROOT.
const EXCLUDED = new Set(["index.html", "sitemap.html"]);

// Directory names whose HTML is never site content (e.g. files shipped
// inside installed dependencies).
const SKIPPED_DIRS = new Set(["node_modules"]);

// Collect every HTML path first. glob() may return paths using the platform
// separator and does not promise a stable order, so normalise to "/" (the
// paths are stored as URLs) and sort to make ids and output reproducible.
const files = [];
for await (const match of glob("**/*.html", { cwd: ROOT })) {
  files.push(match.split(sep).join("/"));
}
files.sort();

const documents = [];
for (const file of files) {
  if (EXCLUDED.has(file)) continue;
  if (file.split("/").some((segment) => SKIPPED_DIRS.has(segment))) continue;

  const html = readFileSync(file, "utf8");
  const $ = cheerio.load(html);

  // Title fallback chain: explicit page title element, then <title>,
  // then the first <h1>, and finally the file path itself.
  const title =
    $(".page-title").first().text().trim() ||
    $("title").first().text().trim() ||
    $("h1").first().text().trim() ||
    file;

  // Drop script and style elements so their source is not indexed as text,
  // then collapse all whitespace runs in the remaining body text.
  $("script, style").remove();
  const text = $("body").text().replace(/\s+/g, " ").trim();

  // The id is the document's position in the sorted, filtered list.
  documents.push({ id: documents.length, title, text, url: file });
}
console.log(`Indexed ${documents.length} documents`);

// Search over title and body text; keep only title and url in the index
// for display, so the full text is not duplicated in the output.
const miniSearch = new MiniSearch({
  fields: ["title", "text"],
  storeFields: ["title", "url"],
});
miniSearch.addAll(documents);

// Written relative to the current working directory (same as ROOT).
writeFileSync("search-index.json", JSON.stringify(miniSearch));
console.log("Wrote search-index.json");