Skip to content

Commit c7466fb

Browse files
authored
Merge pull request #188 from wpengine/remove-elastic-search-duplicates
Remove elastic search duplicates
2 parents 8766b0d + 084eac3 commit c7466fb

File tree

2 files changed

+186
-94
lines changed

2 files changed

+186
-94
lines changed

src/lib/smart-search-plugin.mjs

Lines changed: 125 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -4,26 +4,31 @@ import { cwd } from "node:process";
44
import { htmlToText } from "html-to-text";
55

66
function smartSearchPlugin({ endpoint, accessToken }) {
7+
let isPluginExecuted = false;
8+
79
return {
810
apply: (compiler) => {
911
compiler.hooks.done.tapPromise("SmartSearchPlugin", async () => {
12+
if (isPluginExecuted) {
13+
return;
14+
}
15+
16+
isPluginExecuted = true;
17+
18+
if (compiler.options.mode !== "production") {
19+
console.log("Skipping indexing in non-production mode.");
20+
return;
21+
}
22+
1023
try {
1124
const pages = await collectPages(path.join(cwd(), "src/pages/docs"));
1225

13-
pages.push({
14-
id: "test-document",
15-
data: {
16-
title: "Test Document",
17-
content: "This is a test document for indexing.",
18-
path: "/test-path",
19-
},
20-
});
21-
2226
console.log("Docs Pages collected for indexing:", pages.length);
2327

28+
await deleteExistingDocs(endpoint, accessToken);
2429
await sendPagesToEndpoint(pages, endpoint, accessToken);
2530
} catch (error) {
26-
console.error("Error sending pages:", error);
31+
console.error("Error in smartSearchPlugin:", error);
2732
}
2833
});
2934
},
@@ -32,45 +37,54 @@ function smartSearchPlugin({ endpoint, accessToken }) {
3237

3338
async function collectPages(directory) {
3439
const pages = [];
35-
const files = await fs.readdir(directory);
40+
const entries = await fs.readdir(directory, { withFileTypes: true });
3641

37-
for (const file of files) {
38-
const filePath = path.join(directory, file);
39-
const stat = await fs.stat(filePath);
42+
for (const entry of entries) {
43+
const entryPath = path.join(directory, entry.name);
4044

41-
if (stat.isDirectory()) {
42-
const subPages = await collectPages(filePath);
45+
if (entry.isDirectory()) {
46+
const subPages = await collectPages(entryPath);
4347
pages.push(...subPages);
44-
} else if (file.endsWith(".mdx")) {
45-
const content = await fs.readFile(filePath, "utf8");
48+
} else if (entry.isFile() && entry.name.endsWith(".mdx")) {
49+
const content = await fs.readFile(entryPath, "utf8");
4650

47-
// Safely extract metadata using regex
4851
const metadataMatch = content.match(
49-
/export const metadata = (?<metadata>{[\S\s]+?});/,
52+
/export\s+const\s+metadata\s*=\s*(?<metadata>{[\S\s]*?});/,
5053
);
54+
5155
let metadata = {};
5256

53-
if (metadataMatch) {
57+
if (
58+
metadataMatch &&
59+
metadataMatch.groups &&
60+
metadataMatch.groups.metadata
61+
) {
5462
try {
55-
// eslint-disable-next-line no-eval
56-
metadata = eval(`(${metadataMatch.groups.metadata})`); // Parse the metadata block
63+
metadata = eval(`(${metadataMatch.groups.metadata})`);
5764
} catch (error) {
5865
console.error("Error parsing metadata:", error);
66+
continue;
5967
}
68+
} else {
69+
console.warn(`No metadata found in ${entryPath}. Skipping.`);
70+
continue;
6071
}
6172

6273
const textContent = htmlToText(content);
63-
const id = filePath
64-
.replace(cwd(), "")
65-
.replaceAll("/", "-")
66-
.replace(".mdx", "");
74+
75+
const cleanedPath = cleanPath(entryPath);
76+
77+
const id = `mdx:${cleanedPath}`;
78+
79+
console.log(`Indexing document with ID: ${id}, path: ${cleanedPath}`);
6780

6881
pages.push({
6982
id,
7083
data: {
71-
title: metadata.title || undefined, // No fallback to "Untitled Document"
84+
title: metadata.title,
7285
content: textContent,
73-
path: filePath.replace(cwd(), ""),
86+
path: cleanedPath,
87+
content_type: "mdx_doc",
7488
},
7589
});
7690
}
@@ -79,13 +93,61 @@ async function collectPages(directory) {
7993
return pages;
8094
}
8195

82-
const query = `
83-
mutation CreateIndexDocument($input: DocumentInput!) {
84-
index(input: $input) {
85-
success
96+
function cleanPath(filePath) {
97+
const relativePath = path.relative(cwd(), filePath);
98+
return (
99+
"/" +
100+
relativePath
101+
.replace(/^src\/pages\//, "")
102+
.replace(/^pages\//, "")
103+
.replace(/\/index\.mdx$/, "")
104+
.replace(/\.mdx$/, "")
105+
);
106+
}
107+
108+
async function deleteExistingDocs(endpoint, accessToken) {
109+
const variables = {
110+
filter: {
111+
content_type: "mdx_doc",
112+
},
113+
};
114+
115+
const deleteQuery = `
116+
mutation DeleteDocs($filter: DocumentFilterInput) {
117+
deleteMany(filter: $filter) {
118+
code
119+
message
120+
success
121+
}
122+
}
123+
`;
124+
125+
try {
126+
const response = await fetch(endpoint, {
127+
method: "POST",
128+
headers: {
129+
"Content-Type": "application/json",
130+
Authorization: `Bearer ${accessToken}`,
131+
},
132+
body: JSON.stringify({ query: deleteQuery, variables }),
133+
});
134+
135+
const result = await response.json();
136+
if (result.errors) {
137+
console.error("GraphQL deletion error:", result.errors);
138+
} else {
139+
console.log("Existing MDX documents deleted:", result.data.deleteMany);
140+
}
141+
} catch (error) {
142+
console.error("Error deleting existing documents:", error);
143+
}
144+
}
145+
146+
const bulkIndexQuery = `
147+
mutation BulkIndex($documents: [DocumentInput!]!) {
148+
bulkIndex(input: { documents: $documents }) {
86149
code
87-
message
88-
document {
150+
documents {
89151
id
90152
data
91153
}
@@ -99,40 +161,39 @@ async function sendPagesToEndpoint(pages, endpoint, accessToken) {
99161
return;
100162
}
101163

102-
for (const page of pages) {
103-
const documentId = `mdx:${page.id}`;
104-
const variables = {
105-
input: {
106-
id: documentId,
107-
data: {
108-
content: page.data.content,
109-
path: page.data.path,
110-
title: page.data.title || undefined, // No fallback to "Untitled Document"
111-
},
164+
const documents = pages.map((page) => ({
165+
id: page.id,
166+
data: page.data,
167+
}));
168+
169+
const variables = { documents };
170+
171+
try {
172+
const response = await fetch(endpoint, {
173+
method: "POST",
174+
headers: {
175+
"Content-Type": "application/json",
176+
Authorization: `Bearer ${accessToken}`,
112177
},
113-
};
114-
115-
try {
116-
const response = await fetch(endpoint, {
117-
method: "POST",
118-
headers: {
119-
"Content-Type": "application/json",
120-
Authorization: `Bearer ${accessToken}`,
121-
},
122-
body: JSON.stringify({ query, variables }),
123-
});
178+
body: JSON.stringify({ query: bulkIndexQuery, variables }),
179+
});
124180

125-
const result = await response.json();
126-
if (result.errors) {
127-
console.error("GraphQL indexing error:", result.errors);
128-
}
129-
} catch (error) {
181+
if (!response.ok) {
130182
console.error(
131-
"Error indexing document:",
132-
page.data.title || "No title",
133-
error,
183+
`Error during bulk indexing: ${response.status} ${response.statusText}`,
134184
);
185+
return;
186+
}
187+
188+
const result = await response.json();
189+
190+
if (result.errors) {
191+
console.error("GraphQL bulk indexing error:", result.errors);
192+
} else {
193+
console.log(`Indexed ${documents.length} documents successfully.`);
135194
}
195+
} catch (error) {
196+
console.error("Error during bulk indexing:", error);
136197
}
137198
}
138199

src/pages/api/search.js

Lines changed: 61 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,14 @@
11
import process from "node:process";
22

3-
// Example input: /src/pages/docs/how-to/authentication/index.mdx
4-
// Example output: /docs/how-to/authentication
5-
const generateDocPath = (filePath) =>
6-
filePath.replace(/^\/src\/pages/, "").replace(/\/index\.mdx$/, "");
3+
function cleanPath(filePath) {
4+
return (
5+
filePath
6+
.replace(/^\/?src\/pages/, "")
7+
.replace(/^\/?pages/, "")
8+
.replace(/\/index\.mdx$/, "")
9+
.replace(/\.mdx$/, "") || "/"
10+
);
11+
}
712

813
export default async function handler(req, res) {
914
const endpoint = process.env.NEXT_PUBLIC_SEARCH_ENDPOINT;
@@ -15,16 +20,16 @@ export default async function handler(req, res) {
1520
}
1621

1722
const graphqlQuery = `
18-
query FindDocuments($query: String!) {
19-
find(query: $query) {
20-
total
21-
documents {
22-
id
23-
data
24-
}
23+
query FindDocuments($query: String!) {
24+
find(query: $query) {
25+
total
26+
documents {
27+
id
28+
data
2529
}
2630
}
27-
`;
31+
}
32+
`;
2833

2934
try {
3035
const response = await fetch(endpoint, {
@@ -42,33 +47,59 @@ export default async function handler(req, res) {
4247
const result = await response.json();
4348

4449
if (result.errors) {
45-
console.error("Elasticsearch errors:", result.errors);
50+
console.error("Search errors:", result.errors);
4651
return res.status(500).json({ errors: result.errors });
4752
}
4853

49-
const formattedResults = result.data.find.documents.map((content) => {
50-
const contentType = content.data.post_type ?? "doc";
54+
const formattedResults = result.data.find.documents
55+
.map((content) => {
56+
const contentType = content.data.content_type || content.data.post_type;
57+
let item; // Initialize the variable to hold the result
58+
59+
if (contentType === "mdx_doc" && content.data.title) {
60+
// MDX Document
61+
const path = content.data.path ? cleanPath(content.data.path) : "/";
62+
63+
item = {
64+
id: content.id,
65+
title: content.data.title,
66+
path,
67+
type: "mdx_doc",
68+
};
69+
} else if (
70+
(contentType === "wp_post" || contentType === "post") &&
71+
content.data.post_title &&
72+
content.data.post_name
73+
) {
74+
// WordPress Post
75+
item = {
76+
id: content.id,
77+
title: content.data.post_title,
78+
path: `/blog/${content.data.post_name}`,
79+
type: "post",
80+
};
81+
} else {
82+
item = undefined;
83+
}
84+
85+
return item;
86+
})
87+
.filter((item) => item !== undefined);
5188

52-
if (contentType === "doc") {
53-
return {
54-
id: content.id,
55-
title: content.data.title || "Untitled",
56-
path: content.data.path ? generateDocPath(content.data.path) : "#",
57-
type: contentType,
58-
};
89+
// Remove duplicates based on ID
90+
const seenIds = new Set();
91+
const uniqueResults = formattedResults.filter((item) => {
92+
if (seenIds.has(item.id)) {
93+
return false; // Skip if already in the Set
5994
}
6095

61-
return {
62-
id: content.id,
63-
title: content.data.post_title || "Untitled",
64-
path: `/blog/${content.data.post_name}`,
65-
type: contentType,
66-
};
96+
seenIds.add(item.id); // Add new ID to the Set
97+
return true; // Keep this item
6798
});
6899

69-
return res.status(200).json(formattedResults);
100+
return res.status(200).json(uniqueResults);
70101
} catch (error) {
71-
console.error("Error fetching MDX data:", error);
102+
console.error("Error fetching search data:", error);
72103
return res.status(500).json({ error: error.message });
73104
}
74105
}

0 commit comments

Comments
 (0)