-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgather-context.js
More file actions
110 lines (91 loc) · 3.8 KB
/
gather-context.js
File metadata and controls
110 lines (91 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
require('dotenv').config();
const fs = require('fs').promises;
const { parse } = require('csv-parse');
/**
 * Builds a lookup from message id to the message and its position in the list.
 *
 * Keys are normalised with String() so that ids compare equal regardless of
 * whether the other side (the question-group JSON) stores them as numbers or
 * strings. If the same id occurs more than once, the last occurrence wins.
 *
 * @param {Array<Object>} messages - Parsed CSV rows, each expected to have an `id` field.
 * @returns {Map<string, {message: Object, index: number}>} Lookup keyed by stringified id.
 */
function indexMessagesById(messages) {
  const messageMap = new Map();
  messages.forEach((message, index) => {
    messageMap.set(String(message.id), { message, index });
  });
  return messageMap;
}

/**
 * Returns up to `windowSize` messages that directly follow `startIndex`,
 * reduced to the fields written to the output file.
 *
 * @param {Array<Object>} messages - All loaded messages, in file order.
 * @param {number} startIndex - Index of the message the question came from.
 * @param {number} windowSize - Maximum number of following messages to return.
 * @returns {Array<{id: *, username: *, content: *, date: *}>} Context entries (possibly empty).
 */
function collectFollowingMessages(messages, startIndex, windowSize) {
  // slice() clamps the end index to the array length, so no explicit bound check is needed.
  return messages
    .slice(startIndex + 1, startIndex + 1 + windowSize)
    .map((contextMsg) => ({
      id: contextMsg.id,
      username: contextMsg.Username,
      content: contextMsg.Content,
      date: contextMsg.Date
    }));
}

/**
 * Enriches every question in `question-groups-2.json` with the messages that
 * follow its source message in `discord_messages_with_uuid.csv`, and writes the
 * result to `question-groups-with-context.json`.
 *
 * Progress is logged to the console and the output file is rewritten after
 * every 50 groups as well as once at the end. Any error (unreadable input,
 * invalid JSON/CSV, failed write) is logged and the process exits with code 1.
 *
 * @returns {Promise<void>} Resolves once the final output has been written.
 */
async function gatherContext() {
  const questionGroupsFile = 'question-groups-2.json';
  const messagesFile = 'discord_messages_with_uuid.csv';
  const outputFile = 'question-groups-with-context.json';
  const contextWindowSize = 10; // Number of messages to gather after the question

  try {
    // Load question groups
    const groupsContent = await fs.readFile(questionGroupsFile, 'utf8');
    const questionGroups = JSON.parse(groupsContent);
    if (!Array.isArray(questionGroups)) {
      // Without this check a non-array root would silently overwrite the output with [].
      throw new TypeError(`${questionGroupsFile} must contain a JSON array of question groups`);
    }

    // Load all Discord messages (csv-parse exposes a callback API, hence the Promise wrapper)
    const csvContent = await fs.readFile(messagesFile, 'utf8');
    const messages = await new Promise((resolve, reject) => {
      parse(csvContent, {
        columns: true,
        skip_empty_lines: true
      }, (err, output) => {
        if (err) {
          reject(err);
        } else {
          resolve(output);
        }
      });
    });

    console.log(`Loaded ${questionGroups.length} question groups`);
    console.log(`Loaded ${messages.length} Discord messages`);
    console.log(`Gathering ${contextWindowSize} messages of context for each question`);

    const messageMap = indexMessagesById(messages);
    const enrichedGroups = [];
    const saveResults = () => fs.writeFile(outputFile, JSON.stringify(enrichedGroups, null, 2));

    let totalQuestions = 0;
    let questionsWithContext = 0;

    for (const [groupIndex, group] of questionGroups.entries()) {
      const enrichedGroup = {
        group_id: group.group_id,
        questions_with_context: []
      };

      // A malformed group is reported and skipped rather than aborting the whole
      // run, mirroring how a missing message is handled below.
      let questions = group.questions;
      if (!Array.isArray(questions)) {
        console.log(`  Warning: Group ${group.group_id} has no questions array`);
        questions = [];
      }

      for (const question of questions) {
        // Normalise the id the same way the index does; a missing id never matches.
        const messageData = question.message_id == null
          ? undefined
          : messageMap.get(String(question.message_id));

        let contextMessages = [];
        if (messageData) {
          contextMessages = collectFollowingMessages(messages, messageData.index, contextWindowSize);
        } else {
          console.log(`  Warning: Could not find message ${question.message_id}`);
        }

        enrichedGroup.questions_with_context.push({
          question_id: question.question_id,
          question: question.question,
          message_id: question.message_id,
          context_messages: contextMessages
        });

        totalQuestions += 1;
        if (contextMessages.length > 0) {
          questionsWithContext += 1;
        }
      }

      enrichedGroups.push(enrichedGroup);

      if ((groupIndex + 1) % 50 === 0) {
        console.log(`Processed ${groupIndex + 1} groups`);
        // Save progress so a later failure does not lose everything gathered so far
        await saveResults();
      }
    }

    // Final save
    await saveResults();

    console.log(`\n✓ Context gathering complete!`);
    console.log(`✓ Total groups processed: ${enrichedGroups.length}`);
    console.log(`✓ Total questions: ${totalQuestions}`);
    console.log(`✓ Questions with context: ${questionsWithContext}`);
    console.log(`✓ Results saved to: ${outputFile}`);
  } catch (error) {
    console.error('Error:', error);
    process.exit(1);
  }
}
// Script entry point. The returned promise is deliberately not awaited:
// gatherContext() catches its own errors and exits the process with code 1.
void gatherContext();