@@ -14,13 +14,6 @@ export interface ExtractedTextResult {
1414 key : string ;
1515 value : string ;
1616 } > ;
17- metadata : {
18- documentType : string ;
19- pageCount : number ;
20- isLabReport : boolean ;
21- confidence : number ;
22- processingTimeMs : number ;
23- } ;
2417}
2518
2619/**
@@ -119,12 +112,8 @@ export class AwsTextractService {
119112
120113 // 5. Calculate processing time
121114 const processingTime = Date . now ( ) - startTime ;
122- result . metadata . processingTimeMs = processingTime ;
123115
124116 this . logger . log ( `Document processed in ${ processingTime } ms` , {
125- documentType : result . metadata . documentType ,
126- pageCount : result . metadata . pageCount ,
127- isLabReport : result . metadata . isLabReport ,
128117 lineCount : result . lines . length ,
129118 tableCount : result . tables . length ,
130119 keyValuePairCount : result . keyValuePairs . length ,
@@ -166,7 +155,7 @@ export class AwsTextractService {
166155
167156 const response = await this . client . send ( command ) ;
168157
169- return this . parseTextractResponse ( response , 1 ) ;
158+ return this . parseTextractResponse ( response ) ;
170159 }
171160
172161 /**
@@ -189,17 +178,13 @@ export class AwsTextractService {
189178
190179 const response = await this . client . send ( command ) ;
191180
192- // A real implementation would count pages in the PDF
193- // This example processes just one page for simplicity
194- const estimatedPageCount = 1 ;
195-
196- return this . parseTextractResponse ( response , estimatedPageCount ) ;
181+ return this . parseTextractResponse ( response ) ;
197182 }
198183
199184 /**
200185 * Parse the response from AWS Textract into a structured result
201186 */
202- private parseTextractResponse ( response : any , pageCount : number ) : ExtractedTextResult {
187+ private parseTextractResponse ( response : any ) : ExtractedTextResult {
203188 if ( ! response || ! response . Blocks || response . Blocks . length === 0 ) {
204189 throw new Error ( 'Empty response from Textract' ) ;
205190 }
@@ -210,13 +195,6 @@ export class AwsTextractService {
210195 lines : [ ] ,
211196 tables : [ ] ,
212197 keyValuePairs : [ ] ,
213- metadata : {
214- documentType : this . determineDocumentType ( response . Blocks ) ,
215- pageCount : pageCount ,
216- isLabReport : false , // Will be set later based on content analysis
217- confidence : this . calculateOverallConfidence ( response . Blocks ) ,
218- processingTimeMs : 0 , // Will be set later
219- } ,
220198 } ;
221199
222200 // Extract lines of text
@@ -232,9 +210,6 @@ export class AwsTextractService {
232210 // Extract key-value pairs from FORM analysis
233211 result . keyValuePairs = this . extractKeyValuePairs ( response . Blocks ) ;
234212
235- // Determine if it's a lab report based on content
236- result . metadata . isLabReport = this . isLabReport ( result ) ;
237-
238213 return result ;
239214 }
240215
@@ -379,114 +354,6 @@ export class AwsTextractService {
379354 return wordBlocks . map ( block => block . Text || '' ) . join ( ' ' ) ;
380355 }
381356
382- /**
383- * Calculate overall confidence score from blocks
384- */
385- private calculateOverallConfidence ( blocks : Block [ ] ) : number {
386- if ( ! blocks || blocks . length === 0 ) {
387- return 0 ;
388- }
389-
390- const confidenceValues = blocks
391- . filter ( block => block . Confidence !== undefined )
392- . map ( block => block . Confidence || 0 ) ;
393-
394- if ( confidenceValues . length === 0 ) {
395- return 0 ;
396- }
397-
398- const avgConfidence =
399- confidenceValues . reduce ( ( sum , val ) => sum + val , 0 ) / confidenceValues . length ;
400- return Number ( ( avgConfidence / 100 ) . toFixed ( 2 ) ) ; // Convert to 0-1 scale and limit decimal places
401- }
402-
403- /**
404- * Determine the type of document based on content
405- */
406- private determineDocumentType ( blocks : Block [ ] ) : string {
407- // Extract all text
408- const allText = blocks
409- . filter ( block => block . BlockType === 'LINE' )
410- . map ( block => block . Text || '' )
411- . join ( ' ' )
412- . toLowerCase ( ) ;
413-
414- // Check for lab report keywords
415- if (
416- allText . includes ( 'lab' ) ||
417- allText . includes ( 'laboratory' ) ||
418- allText . includes ( 'test results' ) ||
419- allText . includes ( 'blood' ) ||
420- allText . includes ( 'specimen' )
421- ) {
422- return 'lab_report' ;
423- }
424-
425- // Check for medical report keywords
426- if (
427- allText . includes ( 'diagnosis' ) ||
428- allText . includes ( 'patient' ) ||
429- allText . includes ( 'medical' ) ||
430- allText . includes ( 'doctor' ) ||
431- allText . includes ( 'hospital' )
432- ) {
433- return 'medical_report' ;
434- }
435-
436- // Default
437- return 'general_document' ;
438- }
439-
440- /**
441- * Check if document is likely a lab report based on content
442- */
443- private isLabReport ( result : ExtractedTextResult ) : boolean {
444- // Check document type
445- if ( result . metadata . documentType === 'lab_report' ) {
446- return true ;
447- }
448-
449- // Check for common lab report terms
450- const labReportTerms = [
451- 'cbc' ,
452- 'complete blood count' ,
453- 'hemoglobin' ,
454- 'wbc' ,
455- 'rbc' ,
456- 'platelet' ,
457- 'glucose' ,
458- 'cholesterol' ,
459- 'hdl' ,
460- 'ldl' ,
461- 'triglycerides' ,
462- 'creatinine' ,
463- 'bun' ,
464- 'alt' ,
465- 'ast' ,
466- 'reference range' ,
467- 'normal range' ,
468- 'lab' ,
469- 'test results' ,
470- ] ;
471-
472- const lowerText = result . rawText . toLowerCase ( ) ;
473-
474- // Count how many lab terms appear in the text
475- const termMatches = labReportTerms . filter ( term => lowerText . includes ( term ) ) . length ;
476-
477- // If we have tables and at least 2 lab terms, it's likely a lab report
478- if ( result . tables . length > 0 && termMatches >= 2 ) {
479- return true ;
480- }
481-
482- // If we have more than 3 lab terms, it's likely a lab report even without tables
483- if ( termMatches >= 3 ) {
484- return true ;
485- }
486-
487- return false ;
488- }
489-
490357 /**
491358 * Hash a string identifier for logging purposes
492359 */
@@ -530,13 +397,6 @@ export class AwsTextractService {
530397 lines : [ ] ,
531398 tables : [ ] ,
532399 keyValuePairs : [ ] ,
533- metadata : {
534- documentType : 'unknown' ,
535- pageCount : 0 ,
536- isLabReport : false ,
537- confidence : 0 ,
538- processingTimeMs : 0 ,
539- } ,
540400 } ) ;
541401 }
542402 }
0 commit comments