Skip to content

Commit cbc1b59

Browse files
committed
Add DocumentProcessorModule and DocumentProcessorService for medical document processing
- Introduced `DocumentProcessorModule` in `backend/src/modules/document-processor.module.ts` to encapsulate the document processing logic.
- Implemented `DocumentProcessorService` in `backend/src/services/document-processor.service.ts`, integrating AWS Textract for text extraction and AWS Bedrock for medical analysis.
- Added unit tests for `DocumentProcessorService` in `backend/src/services/document-processor.service.spec.ts` to cover functionality and error handling.
- Updated `app.module.ts` to include `DocumentProcessorModule`, enabling the application to process medical documents.
1 parent 9f941d8 commit cbc1b59

File tree

7 files changed

+635
-3
lines changed

7 files changed

+635
-3
lines changed

backend/src/app.module.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import { ReportsModule } from './reports/reports.module';
1212
import { HealthController } from './health/health.controller';
1313
import { AuthMiddleware } from './auth/auth.middleware';
1414
import { TextractModule } from './modules/textract.module';
15+
import { DocumentProcessorModule } from './modules/document-processor.module';
1516

1617
@Module({
1718
imports: [
@@ -21,6 +22,7 @@ import { TextractModule } from './modules/textract.module';
2122
}),
2223
ReportsModule,
2324
TextractModule,
25+
DocumentProcessorModule,
2426
],
2527
controllers: [AppController, HealthController, PerplexityController, UserController],
2628
providers: [AppService, AwsSecretsService, AwsBedrockService, PerplexityService],
backend/src/modules/document-processor.module.ts

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import { Module } from '@nestjs/common';
2+
import { DocumentProcessorService } from '../services/document-processor.service';
3+
import { ConfigModule } from '@nestjs/config';
4+
import { AwsTextractService } from '../services/aws-textract.service';
5+
import { AwsBedrockService } from '../services/aws-bedrock.service';
6+
7+
@Module({
8+
imports: [ConfigModule],
9+
controllers: [],
10+
providers: [DocumentProcessorService, AwsTextractService, AwsBedrockService],
11+
exports: [DocumentProcessorService],
12+
})
13+
export class DocumentProcessorModule {}

backend/src/services/README.md

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
# Document Processor Service
2+
3+
This service integrates AWS Textract for text extraction and AWS Bedrock for medical analysis to process medical documents.
4+
5+
## Overview
6+
7+
The Document Processor Service provides a unified interface for processing medical documents through a two-step approach:
8+
9+
1. Extract text from medical documents (images or PDFs) using AWS Textract
10+
2. Analyze the extracted text using AWS Bedrock (Claude) to provide structured medical information
11+
12+
## Components
13+
14+
The integration consists of the following components:
15+
16+
1. **DocumentProcessorService**: Main service that orchestrates the document processing workflow
17+
2. **AwsTextractService**: Extracts text, tables, and form data from medical documents
18+
3. **AwsBedrockService**: Analyzes medical text using the Claude model to extract structured information
19+
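4. **DocumentProcessorController**: Exposes HTTP endpoints for document upload and processing (note: `DocumentProcessorModule` currently declares `controllers: []`, so this controller must be added and registered before the endpoint described below is available)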
20+
21+
## Data Models
22+
23+
### ProcessedDocumentResult
24+
25+
The result of document processing includes:
26+
27+
```typescript
28+
export interface ProcessedDocumentResult {
29+
extractedText: ExtractedTextResult;
30+
analysis: MedicalDocumentAnalysis;
31+
processingMetadata: {
32+
processingTimeMs: number;
33+
fileType: string;
34+
fileSize: number;
35+
};
36+
}
37+
```
38+
39+
### ExtractedTextResult
40+
41+
The text extracted by Textract, including the raw text plus the detected lines, tables, and key-value pairs:
42+
43+
```typescript
44+
export interface ExtractedTextResult {
45+
rawText: string;
46+
lines: string[];
47+
tables: Array<{
48+
rows: string[][];
49+
}>;
50+
keyValuePairs: Array<{
51+
key: string;
52+
value: string;
53+
}>;
54+
}
55+
```
56+
57+
### MedicalDocumentAnalysis
58+
59+
The structured medical information from Bedrock:
60+
61+
```typescript
62+
export interface MedicalDocumentAnalysis {
63+
keyMedicalTerms: Array<{ term: string; definition: string }>;
64+
labValues: Array<{
65+
name: string;
66+
value: string;
67+
unit: string;
68+
normalRange: string;
69+
isAbnormal: boolean;
70+
}>;
71+
diagnoses: Array<{ condition: string; details: string; recommendations: string }>;
72+
metadata: {
73+
isMedicalReport: boolean;
74+
confidence: number;
75+
missingInformation: string[];
76+
};
77+
}
78+
```
79+
80+
## API Endpoints
81+
82+
### Process a Document
83+
84+
```
85+
POST /api/document-processor/analyze
86+
```
87+
88+
**Request Format:**
89+
- Content-Type: `multipart/form-data`
90+
- Body: Form with a file upload field named `file`
91+
- Authorization: Bearer token required
92+
93+
**Example Request:**
94+
```bash
95+
curl -X POST \
96+
"http://localhost:3000/api/document-processor/analyze" \
97+
-H "Authorization: Bearer YOUR_JWT_TOKEN" \
98+
-H "Content-Type: multipart/form-data" \
99+
-F "file=@/path/to/medical_report.pdf"
100+
```
101+
102+
**Response:**
103+
```json
104+
{
105+
"extractedText": {
106+
"rawText": "BLOOD TEST RESULTS\nPatient: John Doe\nHemoglobin: 14.2 g/dL (Normal: 13.5-17.5)",
107+
"lines": ["BLOOD TEST RESULTS", "Patient: John Doe", "Hemoglobin: 14.2 g/dL (Normal: 13.5-17.5)"],
108+
"tables": [],
109+
"keyValuePairs": [
110+
{ "key": "Patient", "value": "John Doe" },
111+
{ "key": "Hemoglobin", "value": "14.2 g/dL (Normal: 13.5-17.5)" }
112+
]
113+
},
114+
"analysis": {
115+
"keyMedicalTerms": [
116+
{ "term": "Hemoglobin", "definition": "Oxygen-carrying protein in red blood cells" }
117+
],
118+
"labValues": [
119+
{
120+
"name": "Hemoglobin",
121+
"value": "14.2",
122+
"unit": "g/dL",
123+
"normalRange": "13.5-17.5",
124+
"isAbnormal": false
125+
}
126+
],
127+
"diagnoses": [],
128+
"metadata": {
129+
"isMedicalReport": true,
130+
"confidence": 0.95,
131+
"missingInformation": []
132+
}
133+
},
134+
"processingMetadata": {
135+
"processingTimeMs": 2345,
136+
"fileType": "application/pdf",
137+
"fileSize": 12345
138+
}
139+
}
140+
```
141+
142+
## Usage from Code
143+
144+
```typescript
145+
// Inject the service
146+
constructor(private readonly documentProcessorService: DocumentProcessorService) {}
147+
148+
// Process a document
149+
async processReport(fileBuffer: Buffer, fileType: string, userId: string) {
150+
try {
151+
const result = await this.documentProcessorService.processDocument(
152+
fileBuffer,
153+
fileType,
154+
userId
155+
);
156+
157+
// Use the structured medical data
158+
const labValues = result.analysis.labValues;
159+
const abnormalValues = labValues.filter(lab => lab.isAbnormal);
160+
161+
return result;
162+
} catch (error) {
163+
console.error('Error processing medical document:', error);
164+
throw error;
165+
}
166+
}
167+
```
168+
169+
## Rate Limiting
170+
171+
Both underlying AWS services (Textract and Bedrock) implement rate limiting based on user ID:
172+
- AWS Textract: 10 document requests per minute by default (configurable via `AWS_TEXTRACT_DOCS_PER_MINUTE`)
173+
- AWS Bedrock: 20 model invocations per minute by default (configurable via `AWS_BEDROCK_REQUESTS_PER_MINUTE`)
174+
175+
## Batch Processing
176+
177+
The service supports batch processing of multiple documents:
178+
179+
```typescript
180+
const results = await documentProcessorService.processBatch(
181+
[
182+
{ buffer: fileBuffer1, type: fileType1 },
183+
{ buffer: fileBuffer2, type: fileType2 }
184+
],
185+
userId
186+
);
187+
```
188+
189+
## Configuration
190+
191+
Configure the services through environment variables:
192+
193+
```bash
194+
# AWS Region
195+
AWS_REGION=us-east-1
196+
197+
# AWS Credentials (if not using IAM roles)
198+
AWS_ACCESS_KEY_ID=your-access-key
199+
AWS_SECRET_ACCESS_KEY=your-secret-key
200+
201+
# AWS Bedrock
202+
AWS_BEDROCK_MODEL=us.anthropic.claude-3-7-sonnet-20250219-v1:0
203+
AWS_BEDROCK_MAX_TOKENS=2048
204+
AWS_BEDROCK_REQUESTS_PER_MINUTE=20
205+
206+
# AWS Textract
207+
AWS_TEXTRACT_MAX_BATCH_SIZE=10
208+
AWS_TEXTRACT_DOCS_PER_MINUTE=10
209+
```
210+
211+
## Future Enhancements
212+
213+
Planned future enhancements:
214+
- Support for multi-page PDF processing using async APIs
215+
- Enhanced lab report detection and categorization
216+
- Integration with medical terminology databases
217+
- OCR preprocessing for low-quality images

backend/src/services/aws-bedrock.service.spec.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ describe('AwsBedrockService', () => {
215215

216216
// Verify the invokeBedrock was called with the correct prompt
217217
expect(service['invokeBedrock']).toHaveBeenCalled();
218-
const prompt = (service['invokeBedrock'] as jest.Mock).mock.calls[0][0];
218+
const prompt = (service['invokeBedrock'] as any).mock.calls[0][0];
219219
expect(prompt).toContain('Please analyze this medical document carefully');
220220
});
221221

@@ -238,7 +238,7 @@ describe('AwsBedrockService', () => {
238238

239239
// Verify the invokeBedrock was called with the correct prompt
240240
expect(service['invokeBedrock']).toHaveBeenCalled();
241-
const prompt = (service['invokeBedrock'] as jest.Mock).mock.calls[0][0];
241+
const prompt = (service['invokeBedrock'] as any).mock.calls[0][0];
242242
expect(prompt).toContain('Please analyze this medical document carefully');
243243
});
244244

backend/src/services/aws-textract.service.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ describe('AwsTextractService', () => {
202202

203203
it('should handle rate limiting by user ID', async () => {
204204
// Mock rate limiter to reject the request
205-
(service['rateLimiter'].tryRequest as jest.Mock).mockReturnValueOnce(false);
205+
(service['rateLimiter'].tryRequest as any).mockReturnValueOnce(false);
206206

207207
// Use a test user ID
208208
const userId = 'rate-limited-user';

0 commit comments

Comments
 (0)