|
| 1 | +# Document Processor Service |
| 2 | + |
| 3 | +This service integrates AWS Textract for text extraction and AWS Bedrock for medical analysis to process medical documents. |
| 4 | + |
| 5 | +## Overview |
| 6 | + |
| 7 | +The Document Processor Service provides a unified interface for processing medical documents through a two-step approach: |
| 8 | + |
| 9 | +1. Extract text from medical documents (images or PDFs) using AWS Textract |
| 10 | +2. Analyze the extracted text using AWS Bedrock (Claude) to provide structured medical information |
| 11 | + |
| 12 | +## Components |
| 13 | + |
| 14 | +The integration consists of the following components: |
| 15 | + |
| 16 | +1. **DocumentProcessorService**: Main service that orchestrates the document processing workflow |
| 17 | +2. **AwsTextractService**: Extracts text, tables, and form data from medical documents |
| 18 | +3. **AwsBedrockService**: Analyzes medical text using the Claude model to extract structured information |
| 19 | +4. **DocumentProcessorController**: Exposes HTTP endpoints for document upload and processing |
| 20 | + |
| 21 | +## Data Models |
| 22 | + |
| 23 | +### ProcessedDocumentResult |
| 24 | + |
| 25 | +The result of document processing includes: |
| 26 | + |
| 27 | +```typescript |
| 28 | +export interface ProcessedDocumentResult { |
| 29 | + extractedText: ExtractedTextResult; |
| 30 | + analysis: MedicalDocumentAnalysis; |
| 31 | + processingMetadata: { |
| 32 | + processingTimeMs: number; |
| 33 | + fileType: string; |
| 34 | + fileSize: number; |
| 35 | + }; |
| 36 | +} |
| 37 | +``` |
| 38 | + |
| 39 | +### ExtractedTextResult |
| 40 | + |
| 41 | +The content extracted by Textract: the raw text, its individual lines, any tables, and any key-value pairs: |
| 42 | + |
| 43 | +```typescript |
| 44 | +export interface ExtractedTextResult { |
| 45 | + rawText: string; |
| 46 | + lines: string[]; |
| 47 | + tables: Array<{ |
| 48 | + rows: string[][]; |
| 49 | + }>; |
| 50 | + keyValuePairs: Array<{ |
| 51 | + key: string; |
| 52 | + value: string; |
| 53 | + }>; |
| 54 | +} |
| 55 | +``` |
| 56 | + |
| 57 | +### MedicalDocumentAnalysis |
| 58 | + |
| 59 | +The structured medical information from Bedrock: |
| 60 | + |
| 61 | +```typescript |
| 62 | +export interface MedicalDocumentAnalysis { |
| 63 | + keyMedicalTerms: Array<{ term: string; definition: string }>; |
| 64 | + labValues: Array<{ |
| 65 | + name: string; |
| 66 | + value: string; |
| 67 | + unit: string; |
| 68 | + normalRange: string; |
| 69 | + isAbnormal: boolean; |
| 70 | + }>; |
| 71 | + diagnoses: Array<{ condition: string; details: string; recommendations: string }>; |
| 72 | + metadata: { |
| 73 | + isMedicalReport: boolean; |
| 74 | + confidence: number; |
| 75 | + missingInformation: string[]; |
| 76 | + }; |
| 77 | +} |
| 78 | +``` |
| 79 | + |
| 80 | +## API Endpoints |
| 81 | + |
| 82 | +### Process a Document |
| 83 | + |
| 84 | +```http |
| 85 | +POST /api/document-processor/analyze |
| 86 | +``` |
| 87 | + |
| 88 | +**Request Format:** |
| 89 | +- Content-Type: `multipart/form-data` |
| 90 | +- Body: Form with a file upload field named `file` |
| 91 | +- Authorization: Bearer token (JWT) required, sent in the `Authorization` header |
| 92 | + |
| 93 | +**Example Request:** |
| 94 | +```bash |
| 95 | +curl -X POST \ |
| 96 | + "http://localhost:3000/api/document-processor/analyze" \ |
| 97 | + -H "Authorization: Bearer YOUR_JWT_TOKEN" \ |
| 98 | + -H "Content-Type: multipart/form-data" \ |
| 99 | + -F "file=@/path/to/medical_report.pdf" |
| 100 | +``` |
| 101 | + |
| 102 | +**Response:** |
| 103 | +```json |
| 104 | +{ |
| 105 | + "extractedText": { |
| 106 | + "rawText": "BLOOD TEST RESULTS\nPatient: John Doe\nHemoglobin: 14.2 g/dL (Normal: 13.5-17.5)", |
| 107 | + "lines": ["BLOOD TEST RESULTS", "Patient: John Doe", "Hemoglobin: 14.2 g/dL (Normal: 13.5-17.5)"], |
| 108 | + "tables": [], |
| 109 | + "keyValuePairs": [ |
| 110 | + { "key": "Patient", "value": "John Doe" }, |
| 111 | + { "key": "Hemoglobin", "value": "14.2 g/dL (Normal: 13.5-17.5)" } |
| 112 | + ] |
| 113 | + }, |
| 114 | + "analysis": { |
| 115 | + "keyMedicalTerms": [ |
| 116 | + { "term": "Hemoglobin", "definition": "Oxygen-carrying protein in red blood cells" } |
| 117 | + ], |
| 118 | + "labValues": [ |
| 119 | + { |
| 120 | + "name": "Hemoglobin", |
| 121 | + "value": "14.2", |
| 122 | + "unit": "g/dL", |
| 123 | + "normalRange": "13.5-17.5", |
| 124 | + "isAbnormal": false |
| 125 | + } |
| 126 | + ], |
| 127 | + "diagnoses": [], |
| 128 | + "metadata": { |
| 129 | + "isMedicalReport": true, |
| 130 | + "confidence": 0.95, |
| 131 | + "missingInformation": [] |
| 132 | + } |
| 133 | + }, |
| 134 | + "processingMetadata": { |
| 135 | + "processingTimeMs": 2345, |
| 136 | + "fileType": "application/pdf", |
| 137 | + "fileSize": 12345 |
| 138 | + } |
| 139 | +} |
| 140 | +``` |
| 141 | + |
| 142 | +## Usage from Code |
| 143 | + |
| 144 | +```typescript |
| 145 | +// Inject the service |
| 146 | +constructor(private readonly documentProcessorService: DocumentProcessorService) {} |
| 147 | + |
| 148 | +// Process a document |
| 149 | +async processReport(fileBuffer: Buffer, fileType: string, userId: string) { |
| 150 | + try { |
| 151 | + const result = await this.documentProcessorService.processDocument( |
| 152 | + fileBuffer, |
| 153 | + fileType, |
| 154 | + userId |
| 155 | + ); |
| 156 | + |
| 157 | + // Use the structured medical data |
| 158 | + const labValues = result.analysis.labValues; |
| 159 | + const abnormalValues = labValues.filter(lab => lab.isAbnormal); |
| 160 | + |
| 161 | + return result; |
| 162 | + } catch (error) { |
| 163 | + console.error('Error processing medical document:', error); |
| 164 | + throw error; |
| 165 | + } |
| 166 | +} |
| 167 | +``` |
| 168 | + |
| 169 | +## Rate Limiting |
| 170 | + |
| 171 | +The Textract and Bedrock services each implement rate limiting based on user ID: |
| 172 | +- AWS Textract: 10 document requests per minute by default (configurable via `AWS_TEXTRACT_DOCS_PER_MINUTE`) |
| 173 | +- AWS Bedrock: 20 model invocations per minute by default (configurable via `AWS_BEDROCK_REQUESTS_PER_MINUTE`) |
| 174 | + |
| 175 | +## Batch Processing |
| 176 | + |
| 177 | +The service supports batch processing of multiple documents: |
| 178 | + |
| 179 | +```typescript |
| 180 | +const results = await documentProcessorService.processBatch( |
| 181 | + [ |
| 182 | + { buffer: fileBuffer1, type: fileType1 }, |
| 183 | + { buffer: fileBuffer2, type: fileType2 } |
| 184 | + ], |
| 185 | + userId |
| 186 | +); |
| 187 | +``` |
| 188 | + |
| 189 | +## Configuration |
| 190 | + |
| 191 | +Configure the services through environment variables: |
| 192 | + |
| 193 | +```bash |
| 194 | +# AWS Region |
| 195 | +AWS_REGION=us-east-1 |
| 196 | + |
| 197 | +# AWS Credentials (if not using IAM roles) |
| 198 | +AWS_ACCESS_KEY_ID=your-access-key |
| 199 | +AWS_SECRET_ACCESS_KEY=your-secret-key |
| 200 | + |
| 201 | +# AWS Bedrock |
| 202 | +AWS_BEDROCK_MODEL=us.anthropic.claude-3-7-sonnet-20250219-v1:0 |
| 203 | +AWS_BEDROCK_MAX_TOKENS=2048 |
| 204 | +AWS_BEDROCK_REQUESTS_PER_MINUTE=20 |
| 205 | + |
| 206 | +# AWS Textract |
| 207 | +AWS_TEXTRACT_MAX_BATCH_SIZE=10 |
| 208 | +AWS_TEXTRACT_DOCS_PER_MINUTE=10 |
| 209 | +``` |
| 210 | + |
| 211 | +## Future Enhancements |
| 212 | + |
| 213 | +Planned future enhancements: |
| 214 | +- Support for multi-page PDF processing using async APIs |
| 215 | +- Enhanced lab report detection and categorization |
| 216 | +- Integration with medical terminology databases |
| 217 | +- OCR preprocessing for low-quality images |
0 commit comments