@@ -132,7 +132,7 @@ async function extractContentFromFile(
 
   try {
     const { text } = await generateText({
-      model: openai('gpt-5-mini'),
+      model: openai('gpt-5.1-mini'),
       messages: [
         {
           role: 'user',
@@ -407,18 +407,28 @@ async function parseChunkQuestionsAndAnswers(chunk: string, chunkIndex: number,
       },
       required: ['questionsAndAnswers'],
     }),
-    system: `Extract question-answer pairs from vendor questionnaires. Return structured pairs. Use null for missing answers.`,
+    system: `You parse vendor questionnaires. Return only genuine question text paired with its answer.
+- Ignore table headers, column labels, metadata rows, or placeholder words such as "Question", "Company Name", "Department", "Assessment Date", "Name of Assessor".
+- A valid question is a meaningful sentence (usually ends with '?' or starts with interrogatives like What/Why/How/When/Where/Is/Are/Do/Does/Can/Will/Should).
+- Do not fabricate answers; if no answer is provided, set answer to null.
+- Keep the original question wording but trim whitespace.`,
     prompt: totalChunks > 1
-      ? `Extract question-answer pairs from chunk ${chunkIndex + 1} of ${totalChunks}:
+      ? `Chunk ${chunkIndex + 1} of ${totalChunks}.
+Instructions:
+- Extract only question → answer pairs that represent real questions.
+- Ignore rows or cells that contain only headers/labels (e.g. "Company Name", "Department", "Assessment Date", "Question", "Answer") or other metadata.
+- If an answer is blank, set it to null.
 
-${chunk}
+Chunk content:
+${chunk}`
+      : `Instructions:
+- Extract all meaningful question → answer pairs from the following content.
+- Ignore rows or cells that contain only headers/labels (e.g. "Company Name", "Department", "Assessment Date", "Question", "Answer", "Name of Assessor").
+- Keep only entries that are actual questions (end with '?' or start with interrogative words).
+- If an answer is blank, set it to null.
 
-Return all question-answer pairs found in this chunk.`
-      : `Extract all question-answer pairs from:
-
-${chunk}
-
-Return a structured list of questions and their corresponding answers.`,
+Content:
+${chunk}`,
   });
 
   const parsed = (object as { questionsAndAnswers: QuestionAnswer[] }).questionsAndAnswers;
@@ -435,82 +445,58 @@ Return a structured list of questions and their corresponding answers.`,
  * Optimized to handle large content by chunking and processing in parallel
  */
 async function parseQuestionsAndAnswers(content: string): Promise<QuestionAnswer[]> {
-  // GPT-5-mini can handle ~128k tokens, chunk at 100k tokens for efficiency
-  // 1 token ≈ 4 characters, so 100k tokens ≈ 400k characters
-  const MAX_CHUNK_SIZE_CHARS = 400_000; // Increased for fewer API calls
-  const MIN_CHUNK_SIZE_CHARS = 10_000; // Don't chunk if content is small
-
-  // If content is small, process directly
-  if (content.length <= MIN_CHUNK_SIZE_CHARS) {
-    logger.info('Processing content directly (small size)', {
-      contentLength: content.length,
+  // GPT-5-mini can handle ~128k tokens. Chunk by question count + char limit for efficiency.
+  const MAX_CHUNK_SIZE_CHARS = 80_000;
+  const MIN_CHUNK_SIZE_CHARS = 5_000;
+  const MAX_QUESTIONS_PER_CHUNK = 35;
+
+  const chunkInfos = buildQuestionAwareChunks(content, {
+    maxChunkChars: MAX_CHUNK_SIZE_CHARS,
+    minChunkChars: MIN_CHUNK_SIZE_CHARS,
+    maxQuestionsPerChunk: MAX_QUESTIONS_PER_CHUNK,
+  });
+
+  if (chunkInfos.length === 0) {
+    logger.warn('No content found after preprocessing, returning empty result');
+    return [];
+  }
+
+  if (chunkInfos.length === 1) {
+    logger.info('Processing content as a single chunk', {
+      contentLength: chunkInfos[0].content.length,
+      estimatedQuestions: chunkInfos[0].questionCount,
     });
-    return parseChunkQuestionsAndAnswers(content, 0, 1);
+    return parseChunkQuestionsAndAnswers(chunkInfos[0].content, 0, 1);
   }
-
-  // Chunk large content
-  logger.info('Chunking large content for parallel processing', {
+
+  const totalEstimatedQuestions = chunkInfos.reduce(
+    (sum, chunk) => sum + chunk.questionCount,
+    0,
+  );
+
+  logger.info('Chunking content by question count for parallel processing', {
     contentLength: content.length,
-    estimatedChunks: Math.ceil(content.length / MAX_CHUNK_SIZE_CHARS),
+    totalChunks: chunkInfos.length,
+    avgQuestionsPerChunk: Number(
+      (totalEstimatedQuestions / chunkInfos.length || 0).toFixed(2),
+    ),
   });
 
-  const chunks: string[] = [];
-  let start = 0;
-
-  while (start < content.length) {
-    const end = Math.min(start + MAX_CHUNK_SIZE_CHARS, content.length);
-    let chunk = content.slice(start, end);
-
-    // Try to break at smart boundaries for better context
-    // Prefer breaking after question marks (preserves Q&A pairs)
-    if (end < content.length && chunk.length > MAX_CHUNK_SIZE_CHARS * 0.8) {
-      let breakPoint = -1;
-
-      // First try: break after question mark (best for Q&A content)
-      const lastQuestionMark = chunk.lastIndexOf('?');
-      if (lastQuestionMark > MAX_CHUNK_SIZE_CHARS * 0.7) {
-        // Find end of line after question mark
-        const afterQuestion = chunk.indexOf('\n', lastQuestionMark);
-        breakPoint = afterQuestion !== -1 ? afterQuestion + 1 : lastQuestionMark + 1;
-      }
-
-      // Fallback: break at paragraph boundaries
-      if (breakPoint === -1) {
-        const lastDoubleNewline = chunk.lastIndexOf('\n\n');
-        const lastSingleNewline = chunk.lastIndexOf('\n');
-        breakPoint = Math.max(lastDoubleNewline, lastSingleNewline);
-      }
-
-      if (breakPoint > MAX_CHUNK_SIZE_CHARS * 0.7) {
-        chunk = chunk.slice(0, breakPoint + 1);
-      }
-    }
-
-    if (chunk.trim().length > 0) {
-      chunks.push(chunk.trim());
-    }
-
-    start = end;
-  }
-
-  logger.info('Content chunked, processing in parallel', {
-    totalChunks: chunks.length,
-  });
-
-  // Process ALL chunks in parallel for maximum speed
-  // GPT-5-mini has high rate limits and is faster, so we can process all at once
+  // Process all chunks in parallel for maximum speed
   const parseStartTime = Date.now();
-  const allPromises = chunks.map((chunk, index) =>
-    parseChunkQuestionsAndAnswers(chunk, index, chunks.length),
+  const allPromises = chunkInfos.map((chunk, index) =>
+    parseChunkQuestionsAndAnswers(chunk.content, index, chunkInfos.length),
   );
 
   const allResults = await Promise.all(allPromises);
   const parseTime = ((Date.now() - parseStartTime) / 1000).toFixed(2);
 
+  const totalRawQuestions = allResults.reduce((sum, chunk) => sum + chunk.length, 0);
+
   logger.info('All chunks processed in parallel', {
-    totalChunks: chunks.length,
+    totalChunks: chunkInfos.length,
     parseTimeSeconds: parseTime,
-    totalQuestions: allResults.flat().length,
+    totalQuestions: totalRawQuestions,
   });
 
   // Deduplicate questions (same question might appear in multiple chunks)
@@ -531,12 +517,122 @@ async function parseQuestionsAndAnswers(content: string): Promise<QuestionAnswer
 
   logger.info('Parsing complete', {
     totalQuestions: uniqueResults.length,
-    duplicatesRemoved: allResults.length - uniqueResults.length,
+    duplicatesRemoved: totalRawQuestions - uniqueResults.length,
   });
 
   return uniqueResults;
 }
 
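+/**
+ * A chunk of questionnaire content plus a rough estimate of how many
+ * questions it contains; used for chunk sizing and logging.
+ */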
+interface ChunkInfo {
+  content: string;
+  questionCount: number;
+}
+
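+/**
+ * Splits raw questionnaire text into line-based chunks. A new chunk starts
+ * when the character budget is exceeded, or when the question budget is hit
+ * and the next line looks like a new question, so question/answer pairs tend
+ * to stay together.
+ */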
+function buildQuestionAwareChunks(
+  content: string,
+  options: {
+    maxChunkChars: number;
+    minChunkChars: number;
+    maxQuestionsPerChunk: number;
+  },
+): ChunkInfo[] {
+  const trimmedContent = content.trim();
+  if (!trimmedContent) {
+    return [];
+  }
+
+  if (trimmedContent.length <= options.minChunkChars) {
+    return [
+      {
+        content: trimmedContent,
+        questionCount: estimateQuestionCount(trimmedContent),
+      },
+    ];
+  }
+
+  const chunks: ChunkInfo[] = [];
+  const lines = trimmedContent.split(/\r?\n/);
+  let buffer: string[] = [];
+  let bufferCharCount = 0;
+  let bufferQuestionCount = 0;
+
+  const pushChunk = () => {
+    const chunkText = buffer.join('\n').trim();
+    if (!chunkText) {
+      return;
+    }
+    chunks.push({
+      content: chunkText,
+      questionCount: bufferQuestionCount || estimateQuestionCount(chunkText),
+    });
+    buffer = [];
+    bufferCharCount = 0;
+    bufferQuestionCount = 0;
+  };
+
+  for (const line of lines) {
+    const originalLine = line;
+    const trimmedLine = line.trim();
+    const isEmpty = trimmedLine.length === 0;
+    const looksLikeQuestion = !isEmpty && looksLikeQuestionLine(trimmedLine);
+
+    const exceedsCharBudget =
+      bufferCharCount + originalLine.length > options.maxChunkChars;
+    const exceedsQuestionBudget =
+      bufferQuestionCount >= options.maxQuestionsPerChunk;
+
+    if ((exceedsCharBudget || (exceedsQuestionBudget && looksLikeQuestion)) && buffer.length) {
+      pushChunk();
+    }
+
+    if (!isEmpty || buffer.length) {
+      buffer.push(originalLine);
+      bufferCharCount += originalLine.length + 1;
+    }
+
+    if (looksLikeQuestion) {
+      bufferQuestionCount += 1;
+    }
+  }
+
+  pushChunk();
+
+  return chunks.length > 0
+    ? chunks
+    : [
+        {
+          content: trimmedContent,
+          questionCount: estimateQuestionCount(trimmedContent),
+        },
+      ];
+}
+
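+/**
+ * Heuristic check for whether a line reads like a question: it ends with a
+ * question mark, starts with a "Question"/"Q1." style prefix, or begins with
+ * an interrogative or imperative word (What, How, Describe, ...).
+ */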
+function looksLikeQuestionLine(line: string): boolean {
+  const questionSuffix = /[?？]\s*$/;
+  const explicitQuestionPrefix = /^(?:\d+\s*[\).\]]\s*)?(?:question|q)\b/i;
+  const interrogativePrefix =
+    /^(?:what|why|how|when|where|is|are|does|do|can|will|should|list|describe|explain)\b/i;
+
+  return (
+    questionSuffix.test(line) ||
+    explicitQuestionPrefix.test(line) ||
+    interrogativePrefix.test(line)
+  );
+}
+
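+/**
+ * Roughly estimates how many questions a block of text contains: counts
+ * question marks first, then question-looking lines, then falls back to a
+ * size-based guess.
+ */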
+function estimateQuestionCount(text: string): number {
+  const questionMarks = text.match(/[?？]/g)?.length ?? 0;
+  if (questionMarks > 0) {
+    return questionMarks;
+  }
+  const lines = text.split(/\r?\n/).filter((line) => looksLikeQuestionLine(line.trim()));
+  if (lines.length > 0) {
+    return lines.length;
+  }
+  // Fallback heuristic: assume roughly one question per 1200 chars
+  return Math.max(1, Math.floor(text.length / 1200));
+}
+
 export const parseQuestionnaireTask = task({
   id: 'parse-questionnaire',
   machine: 'large-2x',