-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeduplicate-questions.js
More file actions
108 lines (86 loc) · 3.37 KB
/
deduplicate-questions.js
File metadata and controls
108 lines (86 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
require('dotenv').config();
const fs = require('fs').promises;
/**
 * Computes the cosine similarity between two numeric vectors.
 *
 * The loop runs over `vec1.length` components, so `vec2` is expected to have
 * the same length. If `vec2` is shorter, the missing components read as
 * `undefined` and the result is NaN; if it is longer, the extra components
 * are ignored.
 *
 * @param {number[]} vec1 - First vector.
 * @param {number[]} vec2 - Second vector.
 * @returns {number} Dot product divided by the product of the two Euclidean
 *   norms, or 0 when either norm is zero (empty or all-zero vector).
 */
function cosineSimilarity(vec1, vec2) {
  let dotProduct = 0;
  let norm1 = 0;
  let norm2 = 0;
  for (let i = 0; i < vec1.length; i++) {
    dotProduct += vec1[i] * vec2[i];
    norm1 += vec1[i] * vec1[i];
    norm2 += vec2[i] * vec2[i];
  }
  const magnitude = Math.sqrt(norm1) * Math.sqrt(norm2);
  // A zero-magnitude vector has no direction; dividing would give 0/0 = NaN,
  // so report "no similarity" instead.
  if (magnitude === 0) {
    return 0;
  }
  return dotProduct / magnitude;
}
/**
 * Groups similar questions by embedding similarity and writes the groups to disk.
 *
 * Reads `questions-with-embeddings.json`, then makes a single greedy pass:
 * each not-yet-grouped question seeds a new group and absorbs every later
 * not-yet-grouped question whose cosine similarity to the seed is at least
 * the threshold. Candidates are compared against the seed only, not against
 * other members of the group. The groups are written to
 * `question-groups-2.json` after every 100th input position and once more at
 * the end, followed by summary statistics on stdout.
 *
 * Any error (unreadable file, invalid JSON, non-array content, write failure)
 * is logged and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
async function deduplicateQuestions() {
  const inputFile = 'questions-with-embeddings.json';
  const outputFile = 'question-groups-2.json';
  const similarityThreshold = 0.7;

  // Only these three fields are kept in the output; the embedding is dropped.
  const toGroupMember = ({ question_id, question, message_id }) => ({
    question_id,
    question,
    message_id,
  });
  const saveGroups = (groups) =>
    fs.writeFile(outputFile, JSON.stringify(groups, null, 2));

  try {
    const fileContent = await fs.readFile(inputFile, 'utf8');
    const questions = JSON.parse(fileContent);
    // Anything but an array would make the loop below silently do nothing
    // and write an empty result, so fail loudly instead.
    if (!Array.isArray(questions)) {
      throw new TypeError(`Expected ${inputFile} to contain a JSON array of questions`);
    }

    console.log(`Total questions to process: ${questions.length}`);
    console.log(`Similarity threshold: ${(similarityThreshold * 100).toFixed(0)}%`);

    const questionGroups = [];
    const processedIds = new Set();

    for (let i = 0; i < questions.length; i++) {
      const seed = questions[i];
      // Already absorbed into an earlier group.
      if (processedIds.has(seed.question_id)) {
        continue;
      }

      const group = {
        group_id: seed.question_id,
        questions: [toGroupMember(seed)],
      };
      processedIds.add(seed.question_id);

      // Only later questions can still be ungrouped candidates.
      for (let j = i + 1; j < questions.length; j++) {
        const candidate = questions[j];
        if (processedIds.has(candidate.question_id)) {
          continue;
        }

        const similarity = cosineSimilarity(seed.embedding, candidate.embedding);
        if (similarity >= similarityThreshold) {
          group.questions.push(toGroupMember(candidate));
          processedIds.add(candidate.question_id);
        }
      }

      questionGroups.push(group);

      // Print group ID if group has more than 1 question
      if (group.questions.length > 1) {
        console.log(`Group ${group.group_id}: ${group.questions.length} similar questions`);
      }

      // The checkpoint is keyed to the input position, so it is skipped when
      // that position's question was already grouped (the `continue` above).
      if ((i + 1) % 100 === 0) {
        console.log(`Processed ${i + 1} questions, found ${questionGroups.length} groups so far`);
        // Save progress
        await saveGroups(questionGroups);
      }
    }

    // Final save
    await saveGroups(questionGroups);

    // Calculate statistics
    const totalQuestions = questions.length;
    const totalGroups = questionGroups.length;
    const duplicatesFound = totalQuestions - totalGroups;
    // reduce with a 0 seed avoids -Infinity on empty input and the argument
    // count limit that Math.max(...array) can hit on very large arrays.
    const largestGroup = questionGroups.reduce(
      (largest, g) => Math.max(largest, g.questions.length),
      0,
    );
    // Avoid 0/0 = NaN when there is nothing to report.
    const averageGroupSize = (totalGroups === 0 ? 0 : totalQuestions / totalGroups).toFixed(2);

    console.log(`\n✓ Deduplication complete!`);
    console.log(`✓ Original questions: ${totalQuestions}`);
    console.log(`✓ Question groups: ${totalGroups}`);
    console.log(`✓ Duplicates found: ${duplicatesFound}`);
    console.log(`✓ Largest group: ${largestGroup} questions`);
    console.log(`✓ Average group size: ${averageGroupSize} questions`);
    console.log(`✓ Results saved to: ${outputFile}`);
  } catch (error) {
    console.error('Error:', error);
    process.exit(1);
  }
}
// Script entry point: start the deduplication run. The returned promise is not
// awaited here; deduplicateQuestions catches its own errors, logs them, and
// exits with code 1.
deduplicateQuestions();