Skip to content

Commit b07f552

Browse files
committed
Refactor AwsTextractService to remove metadata handling and simplify response parsing
- Eliminated the metadata object (documentType, pageCount, isLabReport, confidence, and processingTimeMs) from the ExtractedTextResult interface in backend/src/services/aws-textract.service.ts.
- Updated the parseTextractResponse method to no longer require pageCount as a parameter and removed the related logic for determining document type and lab report status.
- Adjusted unit tests in backend/src/services/aws-textract.service.spec.ts to reflect the removal of metadata checks, ensuring tests focus on essential response validation.
1 parent 3d427bb commit b07f552

File tree

2 files changed

+3
-146
lines changed

2 files changed

+3
-146
lines changed

backend/src/services/aws-textract.service.spec.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,6 @@ describe('AwsTextractService', () => {
197197
expect(result.lines.length).toBeGreaterThan(0);
198198
expect(result.tables.length).toBeGreaterThan(0);
199199
expect(result.keyValuePairs.length).toBeGreaterThan(0);
200-
expect(result.metadata.documentType).toBe('lab_report');
201-
expect(result.metadata.isLabReport).toBe(true);
202200
expect(mockTextractSend).toHaveBeenCalled();
203201
});
204202

@@ -212,7 +210,6 @@ describe('AwsTextractService', () => {
212210
expect(result).toBeDefined();
213211
expect(result.rawText).toContain('This is a test medical report');
214212
expect(result.lines.length).toBeGreaterThan(0);
215-
expect(result.metadata.pageCount).toBe(1);
216213
expect(mockTextractSend).toHaveBeenCalled();
217214
});
218215
});

backend/src/services/aws-textract.service.ts

Lines changed: 3 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,6 @@ export interface ExtractedTextResult {
1414
key: string;
1515
value: string;
1616
}>;
17-
metadata: {
18-
documentType: string;
19-
pageCount: number;
20-
isLabReport: boolean;
21-
confidence: number;
22-
processingTimeMs: number;
23-
};
2417
}
2518

2619
/**
@@ -119,12 +112,8 @@ export class AwsTextractService {
119112

120113
// 5. Calculate processing time
121114
const processingTime = Date.now() - startTime;
122-
result.metadata.processingTimeMs = processingTime;
123115

124116
this.logger.log(`Document processed in ${processingTime}ms`, {
125-
documentType: result.metadata.documentType,
126-
pageCount: result.metadata.pageCount,
127-
isLabReport: result.metadata.isLabReport,
128117
lineCount: result.lines.length,
129118
tableCount: result.tables.length,
130119
keyValuePairCount: result.keyValuePairs.length,
@@ -166,7 +155,7 @@ export class AwsTextractService {
166155

167156
const response = await this.client.send(command);
168157

169-
return this.parseTextractResponse(response, 1);
158+
return this.parseTextractResponse(response);
170159
}
171160

172161
/**
@@ -189,17 +178,13 @@ export class AwsTextractService {
189178

190179
const response = await this.client.send(command);
191180

192-
// A real implementation would count pages in the PDF
193-
// This example processes just one page for simplicity
194-
const estimatedPageCount = 1;
195-
196-
return this.parseTextractResponse(response, estimatedPageCount);
181+
return this.parseTextractResponse(response);
197182
}
198183

199184
/**
200185
* Parse the response from AWS Textract into a structured result
201186
*/
202-
private parseTextractResponse(response: any, pageCount: number): ExtractedTextResult {
187+
private parseTextractResponse(response: any): ExtractedTextResult {
203188
if (!response || !response.Blocks || response.Blocks.length === 0) {
204189
throw new Error('Empty response from Textract');
205190
}
@@ -210,13 +195,6 @@ export class AwsTextractService {
210195
lines: [],
211196
tables: [],
212197
keyValuePairs: [],
213-
metadata: {
214-
documentType: this.determineDocumentType(response.Blocks),
215-
pageCount: pageCount,
216-
isLabReport: false, // Will be set later based on content analysis
217-
confidence: this.calculateOverallConfidence(response.Blocks),
218-
processingTimeMs: 0, // Will be set later
219-
},
220198
};
221199

222200
// Extract lines of text
@@ -232,9 +210,6 @@ export class AwsTextractService {
232210
// Extract key-value pairs from FORM analysis
233211
result.keyValuePairs = this.extractKeyValuePairs(response.Blocks);
234212

235-
// Determine if it's a lab report based on content
236-
result.metadata.isLabReport = this.isLabReport(result);
237-
238213
return result;
239214
}
240215

@@ -379,114 +354,6 @@ export class AwsTextractService {
379354
return wordBlocks.map(block => block.Text || '').join(' ');
380355
}
381356

382-
/**
383-
* Calculate overall confidence score from blocks
384-
*/
385-
private calculateOverallConfidence(blocks: Block[]): number {
386-
if (!blocks || blocks.length === 0) {
387-
return 0;
388-
}
389-
390-
const confidenceValues = blocks
391-
.filter(block => block.Confidence !== undefined)
392-
.map(block => block.Confidence || 0);
393-
394-
if (confidenceValues.length === 0) {
395-
return 0;
396-
}
397-
398-
const avgConfidence =
399-
confidenceValues.reduce((sum, val) => sum + val, 0) / confidenceValues.length;
400-
return Number((avgConfidence / 100).toFixed(2)); // Convert to 0-1 scale and limit decimal places
401-
}
402-
403-
/**
404-
* Determine the type of document based on content
405-
*/
406-
private determineDocumentType(blocks: Block[]): string {
407-
// Extract all text
408-
const allText = blocks
409-
.filter(block => block.BlockType === 'LINE')
410-
.map(block => block.Text || '')
411-
.join(' ')
412-
.toLowerCase();
413-
414-
// Check for lab report keywords
415-
if (
416-
allText.includes('lab') ||
417-
allText.includes('laboratory') ||
418-
allText.includes('test results') ||
419-
allText.includes('blood') ||
420-
allText.includes('specimen')
421-
) {
422-
return 'lab_report';
423-
}
424-
425-
// Check for medical report keywords
426-
if (
427-
allText.includes('diagnosis') ||
428-
allText.includes('patient') ||
429-
allText.includes('medical') ||
430-
allText.includes('doctor') ||
431-
allText.includes('hospital')
432-
) {
433-
return 'medical_report';
434-
}
435-
436-
// Default
437-
return 'general_document';
438-
}
439-
440-
/**
441-
* Check if document is likely a lab report based on content
442-
*/
443-
private isLabReport(result: ExtractedTextResult): boolean {
444-
// Check document type
445-
if (result.metadata.documentType === 'lab_report') {
446-
return true;
447-
}
448-
449-
// Check for common lab report terms
450-
const labReportTerms = [
451-
'cbc',
452-
'complete blood count',
453-
'hemoglobin',
454-
'wbc',
455-
'rbc',
456-
'platelet',
457-
'glucose',
458-
'cholesterol',
459-
'hdl',
460-
'ldl',
461-
'triglycerides',
462-
'creatinine',
463-
'bun',
464-
'alt',
465-
'ast',
466-
'reference range',
467-
'normal range',
468-
'lab',
469-
'test results',
470-
];
471-
472-
const lowerText = result.rawText.toLowerCase();
473-
474-
// Count how many lab terms appear in the text
475-
const termMatches = labReportTerms.filter(term => lowerText.includes(term)).length;
476-
477-
// If we have tables and at least 2 lab terms, it's likely a lab report
478-
if (result.tables.length > 0 && termMatches >= 2) {
479-
return true;
480-
}
481-
482-
// If we have more than 3 lab terms, it's likely a lab report even without tables
483-
if (termMatches >= 3) {
484-
return true;
485-
}
486-
487-
return false;
488-
}
489-
490357
/**
491358
* Hash a string identifier for logging purposes
492359
*/
@@ -530,13 +397,6 @@ export class AwsTextractService {
530397
lines: [],
531398
tables: [],
532399
keyValuePairs: [],
533-
metadata: {
534-
documentType: 'unknown',
535-
pageCount: 0,
536-
isLabReport: false,
537-
confidence: 0,
538-
processingTimeMs: 0,
539-
},
540400
});
541401
}
542402
}

0 commit comments

Comments
 (0)