refactor(jobs): optimize question chunking for parallel processing (#1825)

github-actions[bot] · tofikwest · web-flow · commit 96882feea4bc · 2025-11-24T17:11:12.000-05:00
Co-authored-by: Tofik Hasanov <annexcies@gmail.com>
diff --git a/apps/app/src/jobs/tasks/vendors/parse-questionnaire.ts b/apps/app/src/jobs/tasks/vendors/parse-questionnaire.ts
@@ -456,10 +456,10 @@ ${chunk}`,
  * Optimized to handle large content by chunking and processing in parallel
  */
 async function parseQuestionsAndAnswers(content: string): Promise<QuestionAnswer[]> {
-  // GPT-5-mini can handle ~128k tokens. Chunk by question count + char limit for efficiency.
+  // GPT-5-mini can handle ~128k tokens. Chunk by individual questions (1 question = 1 chunk) for parallel processing.
   const MAX_CHUNK_SIZE_CHARS = 80_000;
   const MIN_CHUNK_SIZE_CHARS = 5_000;
-  const MAX_QUESTIONS_PER_CHUNK = 35;
+  const MAX_QUESTIONS_PER_CHUNK = 1; // Each chunk contains exactly one question
 
   const chunkInfos = buildQuestionAwareChunks(content, {
     maxChunkChars: MAX_CHUNK_SIZE_CHARS,
@@ -482,10 +482,10 @@ async function parseQuestionsAndAnswers(content: string): Promise<QuestionAnswer
 
   const totalEstimatedQuestions = chunkInfos.reduce((sum, chunk) => sum + chunk.questionCount, 0);
 
-  logger.info('Chunking content by question count for parallel processing', {
+  logger.info('Chunking content by individual questions (1 question per chunk) for parallel processing', {
     contentLength: content.length,
     totalChunks: chunkInfos.length,
-    avgQuestionsPerChunk: Number((totalEstimatedQuestions / chunkInfos.length || 0).toFixed(2)),
+    questionsPerChunk: 1, // Each chunk contains exactly one question
   });
 
   // Process all chunks in parallel for maximum speed
@@ -547,60 +547,51 @@ function buildQuestionAwareChunks(
     return [];
   }
 
-  if (trimmedContent.length <= options.minChunkChars) {
-    return [
-      {
-        content: trimmedContent,
-        questionCount: estimateQuestionCount(trimmedContent),
-      },
-    ];
-  }
-
   const chunks: ChunkInfo[] = [];
   const lines = trimmedContent.split(/\r?\n/);
-  let buffer: string[] = [];
-  let bufferCharCount = 0;
-  let bufferQuestionCount = 0;
+  let currentChunk: string[] = [];
+  let currentQuestionFound = false;
 
   const pushChunk = () => {
-    const chunkText = buffer.join('\n').trim();
+    const chunkText = currentChunk.join('\n').trim();
     if (!chunkText) {
       return;
     }
     chunks.push({
       content: chunkText,
-      questionCount: bufferQuestionCount || estimateQuestionCount(chunkText),
+      questionCount: 1, // Each chunk contains exactly one question
     });
-    buffer = [];
-    bufferCharCount = 0;
-    bufferQuestionCount = 0;
+    currentChunk = [];
+    currentQuestionFound = false;
   };
 
   for (const line of lines) {
-    const originalLine = line;
     const trimmedLine = line.trim();
     const isEmpty = trimmedLine.length === 0;
     const looksLikeQuestion = !isEmpty && looksLikeQuestionLine(trimmedLine);
 
-    const exceedsCharBudget = bufferCharCount + originalLine.length > options.maxChunkChars;
-    const exceedsQuestionBudget = bufferQuestionCount >= options.maxQuestionsPerChunk;
-
-    if ((exceedsCharBudget || (exceedsQuestionBudget && looksLikeQuestion)) && buffer.length) {
+    // If we find a new question and we already have a question in the current chunk, start a new chunk
+    if (looksLikeQuestion && currentQuestionFound && currentChunk.length > 0) {
       pushChunk();
     }
 
-    if (!isEmpty || buffer.length) {
-      buffer.push(originalLine);
-      bufferCharCount += originalLine.length + 1;
+    // Add line to current chunk (including empty lines for context)
+    if (!isEmpty || currentChunk.length > 0) {
+      currentChunk.push(line);
     }
 
+    // Mark that we've found a question in this chunk
     if (looksLikeQuestion) {
-      bufferQuestionCount += 1;
+      currentQuestionFound = true;
     }
   }
 
-  pushChunk();
+  // Push the last chunk if it has content
+  if (currentChunk.length > 0) {
+    pushChunk();
+  }
 
+  // If no questions were detected, return the entire content as a single chunk
   return chunks.length > 0
     ? chunks
     : [
diff --git a/apps/app/src/lib/vector/core/count-embeddings.ts b/apps/app/src/lib/vector/core/count-embeddings.ts
@@ -30,7 +30,7 @@ export async function countEmbeddings(
     
     const results = await vectorIndex.query({
       vector: queryEmbedding,
-      topK: 1000, // Max allowed by Upstash Vector
+      topK: 100, // Capped at 100 results; Upstash Vector itself allows up to 1000
       includeMetadata: true,
     });
 
@@ -101,7 +101,7 @@ export async function listManualAnswerEmbeddings(
     
     const results = await vectorIndex.query({
       vector: queryEmbedding,
-      topK: 1000,
+      topK: 100,
       includeMetadata: true,
     });
 
diff --git a/apps/app/src/lib/vector/core/find-existing-embeddings.ts b/apps/app/src/lib/vector/core/find-existing-embeddings.ts
@@ -48,7 +48,7 @@ export async function findEmbeddingsForSource(
       const orgQueryEmbedding = await generateEmbedding(organizationId);
       const orgResults = await vectorIndex.query({
         vector: orgQueryEmbedding,
-        topK: 1000,
+        topK: 100,
         includeMetadata: true,
       });
 
@@ -84,7 +84,7 @@ export async function findEmbeddingsForSource(
       const sourceQueryEmbedding = await generateEmbedding(sourceId);
       const sourceResults = await vectorIndex.query({
         vector: sourceQueryEmbedding,
-        topK: 1000,
+        topK: 100,
         includeMetadata: true,
       });
 
@@ -122,7 +122,7 @@ export async function findEmbeddingsForSource(
       const combinedQueryEmbedding = await generateEmbedding(combinedQuery);
       const combinedResults = await vectorIndex.query({
         vector: combinedQueryEmbedding,
-        topK: 1000,
+        topK: 100,
         includeMetadata: true,
       });
 
@@ -160,7 +160,7 @@ export async function findEmbeddingsForSource(
         const docNameQueryEmbedding = await generateEmbedding(documentName);
         const docNameResults = await vectorIndex.query({
           vector: docNameQueryEmbedding,
-          topK: 1000,
+          topK: 100,
           includeMetadata: true,
         });
 
@@ -221,7 +221,7 @@ export async function findEmbeddingsForSource(
                 const contentQueryEmbedding = await generateEmbedding(contentQuery);
                 const contentResults = await vectorIndex.query({
                   vector: contentQueryEmbedding,
-                  topK: 1000,
+                  topK: 100,
                   includeMetadata: true,
                 });
 
@@ -251,7 +251,7 @@ export async function findEmbeddingsForSource(
                 const filenameQueryEmbedding = await generateEmbedding(chunkDocumentName);
                 const filenameResults = await vectorIndex.query({
                   vector: filenameQueryEmbedding,
-                  topK: 1000,
+                  topK: 100,
                   includeMetadata: true,
                 });
 
@@ -306,7 +306,7 @@ export async function findEmbeddingsForSource(
           const genericQueryEmbedding = await generateEmbedding(genericQuery);
           const genericResults = await vectorIndex.query({
             vector: genericQueryEmbedding,
-            topK: 1000,
+            topK: 100,
             includeMetadata: true,
           });
 
@@ -389,7 +389,7 @@ export async function findAllOrganizationEmbeddings(
     // Respect Upstash Vector limit of 1000
     const results = await vectorIndex.query({
       vector: queryEmbedding,
-      topK: 1000, // Max allowed by Upstash Vector
+      topK: 100, // Capped at 100 results; Upstash Vector itself allows up to 1000
       includeMetadata: true,
     });
 
diff --git a/apps/app/src/lib/vector/core/find-similar.ts b/apps/app/src/lib/vector/core/find-similar.ts
@@ -49,7 +49,7 @@ export async function findSimilarContent(
     // so we'll filter results after retrieval
     const results = await vectorIndex.query({
       vector: queryEmbedding,
-      topK: limit * 2, // Get more results to account for filtering
+      topK: 100, // Fetch a fixed 100 candidates so enough remain after post-retrieval filtering
       includeMetadata: true,
     });
 
diff --git a/apps/app/src/lib/vector/sync/sync-organization.ts b/apps/app/src/lib/vector/sync/sync-organization.ts
@@ -259,7 +259,7 @@ async function performSync(organizationId: string): Promise<void> {
               return; // Skip empty context
             }
 
-            const chunks = chunkText(contextText, 500, 50);
+            const chunks = chunkText(contextText, 8000, 50);
             
             if (chunks.length === 0) {
               return; // Skip if no chunks

Original file line number	Diff line number	Diff line change
`@@ -259,7 +259,7 @@ async function performSync(organizationId: string): Promise<void> {`
`259`	`259`	`return; // Skip empty context`
`260`	`260`	`}`
`261`	`261`
`262`		`- const chunks = chunkText(contextText, 500, 50);`
	`262`	`+ const chunks = chunkText(contextText, 8000, 50);`
`263`	`263`
`264`	`264`	`if (chunks.length === 0) {`
`265`	`265`	`return; // Skip if no chunks`