
Commit 7696cd0

refactor(parse-questionnaire): enhance chunk processing and question extraction logic
1 parent 43e2de1 commit 7696cd0

File tree

1 file changed (+170, -74 lines)


apps/app/src/jobs/tasks/vendors/parse-questionnaire.ts

Lines changed: 170 additions & 74 deletions
@@ -132,7 +132,7 @@ async function extractContentFromFile(
 
   try {
     const { text } = await generateText({
-      model: openai('gpt-5-mini'),
+      model: openai('gpt-5.1-mini'),
      messages: [
        {
          role: 'user',
@@ -407,18 +407,28 @@ async function parseChunkQuestionsAndAnswers(chunk: string, chunkIndex: number,
       },
       required: ['questionsAndAnswers'],
     }),
-    system: `Extract question-answer pairs from vendor questionnaires. Return structured pairs. Use null for missing answers.`,
+    system: `You parse vendor questionnaires. Return only genuine question text paired with its answer.
+- Ignore table headers, column labels, metadata rows, or placeholder words such as "Question", "Company Name", "Department", "Assessment Date", "Name of Assessor".
+- A valid question is a meaningful sentence (usually ends with '?' or starts with interrogatives like What/Why/How/When/Where/Is/Are/Do/Does/Can/Will/Should).
+- Do not fabricate answers; if no answer is provided, set answer to null.
+- Keep the original question wording but trim whitespace.`,
     prompt: totalChunks > 1
-      ? `Extract question-answer pairs from chunk ${chunkIndex + 1} of ${totalChunks}:
+      ? `Chunk ${chunkIndex + 1} of ${totalChunks}.
+Instructions:
+- Extract only question → answer pairs that represent real questions.
+- Ignore rows or cells that contain only headers/labels (e.g. "Company Name", "Department", "Assessment Date", "Question", "Answer") or other metadata.
+- If an answer is blank, set it to null.
 
-${chunk}
+Chunk content:
+${chunk}`
+      : `Instructions:
+- Extract all meaningful question → answer pairs from the following content.
+- Ignore rows or cells that contain only headers/labels (e.g. "Company Name", "Department", "Assessment Date", "Question", "Answer", "Name of Assessor").
+- Keep only entries that are actual questions (end with '?' or start with interrogative words).
+- If an answer is blank, set it to null.
 
-Return all question-answer pairs found in this chunk.`
-      : `Extract all question-answer pairs from:
-
-${chunk}
-
-Return a structured list of questions and their corresponding answers.`,
+Content:
+${chunk}`,
   });
 
   const parsed = (object as { questionsAndAnswers: QuestionAnswer[] }).questionsAndAnswers;
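
For orientation, the system and prompt strings edited above feed a structured-output call whose full definition sits outside this hunk. Below is a minimal sketch of what that surrounding call plausibly looks like, assuming the Vercel AI SDK's generateObject with a jsonSchema definition; the schema shape and field names are inferred from the visible context (`required: ['questionsAndAnswers']`, the `object` destructuring) and are not confirmed by the diff.

```ts
import { generateObject, jsonSchema } from 'ai';
import { openai } from '@ai-sdk/openai';

// Sketch only: the real schema, model wiring, and error handling live in
// parse-questionnaire.ts outside this hunk.
const { object } = await generateObject({
  model: openai('gpt-5.1-mini'),
  schema: jsonSchema<{
    questionsAndAnswers: { question: string; answer: string | null }[];
  }>({
    type: 'object',
    properties: {
      questionsAndAnswers: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            question: { type: 'string' },
            answer: { type: ['string', 'null'] },
          },
          required: ['question', 'answer'],
        },
      },
    },
    required: ['questionsAndAnswers'],
  }),
  system: '…', // the refined system prompt shown above
  prompt: '…', // the chunk-aware prompt shown above
});
```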
@@ -435,82 +445,58 @@ Return a structured list of questions and their corresponding answers.`,
  * Optimized to handle large content by chunking and processing in parallel
  */
 async function parseQuestionsAndAnswers(content: string): Promise<QuestionAnswer[]> {
-  // GPT-5-mini can handle ~128k tokens, chunk at 100k tokens for efficiency
-  // 1 token ≈ 4 characters, so 100k tokens ≈ 400k characters
-  const MAX_CHUNK_SIZE_CHARS = 400_000; // Increased for fewer API calls
-  const MIN_CHUNK_SIZE_CHARS = 10_000; // Don't chunk if content is small
-
-  // If content is small, process directly
-  if (content.length <= MIN_CHUNK_SIZE_CHARS) {
-    logger.info('Processing content directly (small size)', {
-      contentLength: content.length,
+  // GPT-5-mini can handle ~128k tokens. Chunk by question count + char limit for efficiency.
+  const MAX_CHUNK_SIZE_CHARS = 80_000;
+  const MIN_CHUNK_SIZE_CHARS = 5_000;
+  const MAX_QUESTIONS_PER_CHUNK = 35;
+
+  const chunkInfos = buildQuestionAwareChunks(content, {
+    maxChunkChars: MAX_CHUNK_SIZE_CHARS,
+    minChunkChars: MIN_CHUNK_SIZE_CHARS,
+    maxQuestionsPerChunk: MAX_QUESTIONS_PER_CHUNK,
+  });
+
+  if (chunkInfos.length === 0) {
+    logger.warn('No content found after preprocessing, returning empty result');
+    return [];
+  }
+
+  if (chunkInfos.length === 1) {
+    logger.info('Processing content as a single chunk', {
+      contentLength: chunkInfos[0].content.length,
+      estimatedQuestions: chunkInfos[0].questionCount,
     });
-    return parseChunkQuestionsAndAnswers(content, 0, 1);
+    return parseChunkQuestionsAndAnswers(chunkInfos[0].content, 0, 1);
   }
-
-  // Chunk large content
-  logger.info('Chunking large content for parallel processing', {
+
+  const totalEstimatedQuestions = chunkInfos.reduce(
+    (sum, chunk) => sum + chunk.questionCount,
+    0,
+  );
+
+  logger.info('Chunking content by question count for parallel processing', {
     contentLength: content.length,
-    estimatedChunks: Math.ceil(content.length / MAX_CHUNK_SIZE_CHARS),
+    totalChunks: chunkInfos.length,
+    avgQuestionsPerChunk: Number(
+      (totalEstimatedQuestions / chunkInfos.length || 0).toFixed(2),
+    ),
   });
 
-  const chunks: string[] = [];
-  let start = 0;
-
-  while (start < content.length) {
-    const end = Math.min(start + MAX_CHUNK_SIZE_CHARS, content.length);
-    let chunk = content.slice(start, end);
-
-    // Try to break at smart boundaries for better context
-    // Prefer breaking after question marks (preserves Q&A pairs)
-    if (end < content.length && chunk.length > MAX_CHUNK_SIZE_CHARS * 0.8) {
-      let breakPoint = -1;
-
-      // First try: break after question mark (best for Q&A content)
-      const lastQuestionMark = chunk.lastIndexOf('?');
-      if (lastQuestionMark > MAX_CHUNK_SIZE_CHARS * 0.7) {
-        // Find end of line after question mark
-        const afterQuestion = chunk.indexOf('\n', lastQuestionMark);
-        breakPoint = afterQuestion !== -1 ? afterQuestion + 1 : lastQuestionMark + 1;
-      }
-
-      // Fallback: break at paragraph boundaries
-      if (breakPoint === -1) {
-        const lastDoubleNewline = chunk.lastIndexOf('\n\n');
-        const lastSingleNewline = chunk.lastIndexOf('\n');
-        breakPoint = Math.max(lastDoubleNewline, lastSingleNewline);
-      }
-
-      if (breakPoint > MAX_CHUNK_SIZE_CHARS * 0.7) {
-        chunk = chunk.slice(0, breakPoint + 1);
-      }
-    }
-
-    if (chunk.trim().length > 0) {
-      chunks.push(chunk.trim());
-    }
-
-    start = end;
-  }
-
-  logger.info('Content chunked, processing in parallel', {
-    totalChunks: chunks.length,
-  });
-
-  // Process ALL chunks in parallel for maximum speed
-  // GPT-5-mini has high rate limits and is faster, so we can process all at once
+  // Process all chunks in parallel for maximum speed
   const parseStartTime = Date.now();
-  const allPromises = chunks.map((chunk, index) =>
-    parseChunkQuestionsAndAnswers(chunk, index, chunks.length),
+  const allPromises = chunkInfos.map((chunk, index) =>
+    parseChunkQuestionsAndAnswers(chunk.content, index, chunkInfos.length),
   );
 
   const allResults = await Promise.all(allPromises);
   const parseTime = ((Date.now() - parseStartTime) / 1000).toFixed(2);
 
+  const totalRawQuestions = allResults.reduce((sum, chunk) => sum + chunk.length, 0);
+
   logger.info('All chunks processed in parallel', {
-    totalChunks: chunks.length,
+    totalChunks: chunkInfos.length,
     parseTimeSeconds: parseTime,
-    totalQuestions: allResults.flat().length,
+    totalQuestions: totalRawQuestions,
   });
 
   // Deduplicate questions (same question might appear in multiple chunks)
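
To see how the two budgets interact, consider an illustrative questionnaire export (numbers invented for this example, not taken from the commit): 200,000 characters containing about 150 question-like lines. The character budget alone would give ceil(200000 / 80000) = 3 chunks, but the 35-question budget pushes that to roughly ceil(150 / 35) = 5, so the question count is what actually drives the split. A quick sketch of that back-of-the-envelope estimate:

```ts
// Rough lower-bound estimate of chunk count under the new budgets.
// The real splitter (buildQuestionAwareChunks, added in the next hunk)
// walks line by line and may produce slightly more chunks than this.
const contentLength = 200_000; // hypothetical questionnaire size in chars
const questionLines = 150;     // hypothetical count of question-like lines

const byChars = Math.ceil(contentLength / 80_000);  // 3
const byQuestions = Math.ceil(questionLines / 35);  // 5
console.log(Math.max(byChars, byQuestions));        // ≈ 5 chunks
```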
@@ -531,12 +517,122 @@ async function parseQuestionsAndAnswers(content: string): Promise<QuestionAnswer
 
   logger.info('Parsing complete', {
     totalQuestions: uniqueResults.length,
-    duplicatesRemoved: allResults.length - uniqueResults.length,
+    duplicatesRemoved: totalRawQuestions - uniqueResults.length,
   });
 
   return uniqueResults;
 }
 
+interface ChunkInfo {
+  content: string;
+  questionCount: number;
+}
+
+function buildQuestionAwareChunks(
+  content: string,
+  options: {
+    maxChunkChars: number;
+    minChunkChars: number;
+    maxQuestionsPerChunk: number;
+  },
+): ChunkInfo[] {
+  const trimmedContent = content.trim();
+  if (!trimmedContent) {
+    return [];
+  }
+
+  if (trimmedContent.length <= options.minChunkChars) {
+    return [
+      {
+        content: trimmedContent,
+        questionCount: estimateQuestionCount(trimmedContent),
+      },
+    ];
+  }
+
+  const chunks: ChunkInfo[] = [];
+  const lines = trimmedContent.split(/\r?\n/);
+  let buffer: string[] = [];
+  let bufferCharCount = 0;
+  let bufferQuestionCount = 0;
+
+  const pushChunk = () => {
+    const chunkText = buffer.join('\n').trim();
+    if (!chunkText) {
+      return;
+    }
+    chunks.push({
+      content: chunkText,
+      questionCount: bufferQuestionCount || estimateQuestionCount(chunkText),
+    });
+    buffer = [];
+    bufferCharCount = 0;
+    bufferQuestionCount = 0;
+  };
+
+  for (const line of lines) {
+    const originalLine = line;
+    const trimmedLine = line.trim();
+    const isEmpty = trimmedLine.length === 0;
+    const looksLikeQuestion = !isEmpty && looksLikeQuestionLine(trimmedLine);
+
+    const exceedsCharBudget =
+      bufferCharCount + originalLine.length > options.maxChunkChars;
+    const exceedsQuestionBudget =
+      bufferQuestionCount >= options.maxQuestionsPerChunk;
+
+    if ((exceedsCharBudget || (exceedsQuestionBudget && looksLikeQuestion)) && buffer.length) {
+      pushChunk();
+    }
+
+    if (!isEmpty || buffer.length) {
+      buffer.push(originalLine);
+      bufferCharCount += originalLine.length + 1;
+    }
+
+    if (looksLikeQuestion) {
+      bufferQuestionCount += 1;
+    }
+  }
+
+  pushChunk();
+
+  return chunks.length > 0
+    ? chunks
+    : [
+        {
+          content: trimmedContent,
+          questionCount: estimateQuestionCount(trimmedContent),
+        },
+      ];
+}
+
+function looksLikeQuestionLine(line: string): boolean {
+  const questionSuffix = /[?]\s*$/;
+  const explicitQuestionPrefix = /^(?:\d+\s*[\).\]]\s*)?(?:question|q)\b/i;
+  const interrogativePrefix =
+    /^(?:what|why|how|when|where|is|are|does|do|can|will|should|list|describe|explain)\b/i;
+
+  return (
+    questionSuffix.test(line) ||
+    explicitQuestionPrefix.test(line) ||
+    interrogativePrefix.test(line)
+  );
+}
+
+function estimateQuestionCount(text: string): number {
+  const questionMarks = text.match(/[?]/g)?.length ?? 0;
+  if (questionMarks > 0) {
+    return questionMarks;
+  }
+  const lines = text.split(/\r?\n/).filter((line) => looksLikeQuestionLine(line.trim()));
+  if (lines.length > 0) {
+    return lines.length;
+  }
+  // Fallback heuristic: assume roughly one question per 1200 chars
+  return Math.max(1, Math.floor(text.length / 1200));
+}
+
 export const parseQuestionnaireTask = task({
   id: 'parse-questionnaire',
   machine: 'large-2x',
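
A small usage sketch of the new helpers. They are module-private in this file, so the direct calls below assume local access (for example from a co-located test); the expected outputs follow from the regexes in looksLikeQuestionLine and the question-mark count in estimateQuestionCount.

```ts
// Illustrative sanity check of the question heuristics added above.
const samples = [
  'Do you encrypt data at rest?',            // true: ends with '?'
  'Describe your incident response process', // true: starts with "describe"
  'Company Name',                            // false: metadata label
  'Assessment Date',                         // false: metadata label
];
for (const line of samples) {
  console.log(line, '->', looksLikeQuestionLine(line));
}

const chunks = buildQuestionAwareChunks(samples.join('\n'), {
  maxChunkChars: 80_000,
  minChunkChars: 5_000,
  maxQuestionsPerChunk: 35,
});
// Content is under minChunkChars, so it stays a single chunk; questionCount
// comes from estimateQuestionCount, which counts '?' occurrences first and
// therefore reports 1 here.
console.log(chunks.length, chunks[0].questionCount); // 1 1
```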
