Skip to content

Commit b13a0cc

Browse files
committed
Refactor document processing to remove file type parameter and enhance file type validation
1 parent 971a5a3 commit b13a0cc

File tree

7 files changed

+40
-79
lines changed

7 files changed

+40
-79
lines changed

backend/src/document-processor/controllers/document-processor.controller.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ export class DocumentProcessorController {
6363
// Process the document
6464
const result = await this.documentProcessorService.processDocument(
6565
file.buffer,
66-
file.mimetype,
6766
effectiveUserId,
6867
);
6968

backend/src/document-processor/services/aws-textract.service.spec.ts

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -173,11 +173,7 @@ describe('AwsTextractService', () => {
173173

174174
describe('extractText', () => {
175175
it('should extract text from an image', async () => {
176-
const result = await service.extractText(
177-
Buffer.from('test image content'),
178-
'image/jpeg',
179-
'user-123',
180-
);
176+
const result = await service.extractText(Buffer.from('test image content'), 'user-123');
181177

182178
expect(result).toBeDefined();
183179
expect(result.rawText).toContain('This is a test medical report');
@@ -188,11 +184,7 @@ describe('AwsTextractService', () => {
188184
});
189185

190186
it('should extract text from a PDF', async () => {
191-
const result = await service.extractText(
192-
Buffer.from('test pdf content'),
193-
'application/pdf',
194-
'user-123',
195-
);
187+
const result = await service.extractText(Buffer.from('test pdf content'), 'user-123');
196188

197189
expect(result).toBeDefined();
198190
expect(result.rawText).toContain('This is a test medical report');
@@ -208,9 +200,9 @@ describe('AwsTextractService', () => {
208200
const userId = 'rate-limited-user';
209201

210202
// Should throw rate limit exception
211-
await expect(
212-
service.extractText(Buffer.from('test content'), 'image/jpeg', userId),
213-
).rejects.toThrow('Too many requests');
203+
await expect(service.extractText(Buffer.from('test content'), userId)).rejects.toThrow(
204+
'Too many requests',
205+
);
214206

215207
// The textract API should not be called
216208
expect(mockTextractSend).not.toHaveBeenCalled();
@@ -222,11 +214,9 @@ describe('AwsTextractService', () => {
222214
const documents = [
223215
{
224216
buffer: Buffer.from('test image 1'),
225-
type: 'image/jpeg',
226217
},
227218
{
228219
buffer: Buffer.from('test image 2'),
229-
type: 'image/png',
230220
},
231221
];
232222

@@ -242,7 +232,6 @@ describe('AwsTextractService', () => {
242232
it('should throw an error if batch size exceeds maximum', async () => {
243233
const documents = Array(11).fill({
244234
buffer: Buffer.from('test image'),
245-
type: 'image/jpeg',
246235
});
247236

248237
await expect(service.processBatch(documents, 'user-123')).rejects.toThrow(

backend/src/document-processor/services/aws-textract.service.ts

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,10 @@ export class AwsTextractService {
8181
/**
8282
* Extract text from a medical lab report image or PDF
8383
* @param fileBuffer The file buffer containing the image or PDF
84-
* @param fileType The MIME type of the file (e.g., 'image/jpeg', 'application/pdf')
8584
* @param userId The authenticated user's ID for rate limiting
8685
* @returns Extracted text result with structured information
8786
*/
88-
async extractText(
89-
fileBuffer: Buffer,
90-
fileType: string,
91-
userId: string,
92-
): Promise<ExtractedTextResult> {
87+
async extractText(fileBuffer: Buffer, userId: string): Promise<ExtractedTextResult> {
9388
try {
9489
const startTime = Date.now();
9590

@@ -99,17 +94,16 @@ export class AwsTextractService {
9994
}
10095

10196
// 2. Validate file securely
102-
validateFileSecurely(fileBuffer, fileType);
97+
validateFileSecurely(fileBuffer);
10398

10499
// Add diagnostic information about the document being processed
105100
this.logger.debug('Processing document', {
106-
fileType,
107101
fileSize: `${(fileBuffer.length / 1024).toFixed(2)} KB`,
108102
contentHashPrefix: createHash('sha256').update(fileBuffer).digest('hex').substring(0, 10),
109103
});
110104

111105
// 3. Process document
112-
const result = await this.processDocument(fileBuffer, fileType);
106+
const result = await this.processDocument(fileBuffer);
113107

114108
// 4. Calculate processing time
115109
const processingTime = Date.now() - startTime;
@@ -125,7 +119,6 @@ export class AwsTextractService {
125119
// Log error securely without exposing sensitive details
126120
this.logger.error('Error processing document', {
127121
error: error instanceof Error ? error.message : 'Unknown error',
128-
fileType,
129122
timestamp: new Date().toISOString(),
130123
userId: this.hashIdentifier(userId),
131124
});
@@ -143,13 +136,8 @@ export class AwsTextractService {
143136
/**
144137
* Process a document (image or PDF)
145138
*/
146-
private async processDocument(
147-
documentBuffer: Buffer,
148-
documentType: string,
149-
): Promise<ExtractedTextResult> {
150-
this.logger.log(
151-
`Processing ${documentType === 'application/pdf' ? 'PDF document' : 'single image'} with Textract`,
152-
);
139+
private async processDocument(documentBuffer: Buffer): Promise<ExtractedTextResult> {
140+
this.logger.log(`Processing file with Textract`);
153141

154142
// Use Analyze Document API for more comprehensive analysis
155143
const command = new AnalyzeDocumentCommand({
@@ -346,12 +334,12 @@ export class AwsTextractService {
346334

347335
/**
348336
* Process multiple documents in batch
349-
* @param documents Array of document buffers with their types
337+
* @param documents Array of document buffers
350338
* @param userId The authenticated user's ID for rate limiting
351339
* @returns Array of extracted text results
352340
*/
353341
async processBatch(
354-
documents: Array<{ buffer: Buffer; type: string }>,
342+
documents: Array<{ buffer: Buffer }>,
355343
userId: string,
356344
): Promise<ExtractedTextResult[]> {
357345
// Validate batch size
@@ -365,12 +353,11 @@ export class AwsTextractService {
365353

366354
for (const doc of documents) {
367355
try {
368-
const result = await this.extractText(doc.buffer, doc.type, userId);
356+
const result = await this.extractText(doc.buffer, userId);
369357
results.push(result);
370358
} catch (error) {
371359
this.logger.error('Error processing document in batch', {
372360
error: error instanceof Error ? error.message : 'Unknown error',
373-
fileType: doc.type,
374361
fileSize: doc.buffer.length,
375362
});
376363

backend/src/document-processor/services/document-processor.service.spec.ts

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ describe('DocumentProcessorService', () => {
2424
it('should extract text and analyze medical document', async () => {
2525
// Arrange
2626
const fileBuffer = Buffer.from('test');
27-
const fileType = 'application/pdf';
2827
const userId = 'test-user';
2928

3029
const extractedTextResult = {
@@ -67,10 +66,10 @@ describe('DocumentProcessorService', () => {
6766
);
6867

6968
// Act
70-
const result = await testService.processDocument(fileBuffer, fileType, userId);
69+
const result = await testService.processDocument(fileBuffer, userId);
7170

7271
// Assert
73-
expect(testTextractService.extractText).toHaveBeenCalledWith(fileBuffer, fileType, userId);
72+
expect(testTextractService.extractText).toHaveBeenCalledWith(fileBuffer, userId);
7473
expect(testBedrockService.analyzeMedicalDocument).toHaveBeenCalledWith(
7574
extractedTextResult.rawText,
7675
userId,
@@ -83,7 +82,6 @@ describe('DocumentProcessorService', () => {
8382
analysis: medicalAnalysis,
8483
simplifiedExplanation,
8584
processingMetadata: expect.objectContaining({
86-
fileType,
8785
fileSize: fileBuffer.length,
8886
}),
8987
});
@@ -92,7 +90,6 @@ describe('DocumentProcessorService', () => {
9290
it('should throw BadRequestException when text extraction fails', async () => {
9391
// Arrange
9492
const fileBuffer = Buffer.from('test');
95-
const fileType = 'application/pdf';
9693
const userId = 'test-user';
9794

9895
// Create test-specific service with proper mocking
@@ -111,7 +108,7 @@ describe('DocumentProcessorService', () => {
111108
);
112109

113110
// Act & Assert
114-
await expect(testService.processDocument(fileBuffer, fileType, userId)).rejects.toThrow(
111+
await expect(testService.processDocument(fileBuffer, userId)).rejects.toThrow(
115112
BadRequestException,
116113
);
117114
});
@@ -163,7 +160,6 @@ describe('DocumentProcessorService', () => {
163160
simplifiedExplanation: 'Simple explanation for document 1',
164161
processingMetadata: {
165162
processingTimeMs: 100,
166-
fileType: 'application/pdf',
167163
fileSize: 4,
168164
},
169165
};
@@ -190,7 +186,6 @@ describe('DocumentProcessorService', () => {
190186
simplifiedExplanation: 'Simple explanation for document 2',
191187
processingMetadata: {
192188
processingTimeMs: 100,
193-
fileType: 'image/jpeg',
194189
fileSize: 4,
195190
},
196191
};
@@ -204,18 +199,8 @@ describe('DocumentProcessorService', () => {
204199

205200
// Assert
206201
expect(processDocumentSpy).toHaveBeenCalledTimes(2);
207-
expect(processDocumentSpy).toHaveBeenNthCalledWith(
208-
1,
209-
documents[0].buffer,
210-
documents[0].type,
211-
userId,
212-
);
213-
expect(processDocumentSpy).toHaveBeenNthCalledWith(
214-
2,
215-
documents[1].buffer,
216-
documents[1].type,
217-
userId,
218-
);
202+
expect(processDocumentSpy).toHaveBeenNthCalledWith(1, documents[0].buffer, userId);
203+
expect(processDocumentSpy).toHaveBeenNthCalledWith(2, documents[1].buffer, userId);
219204
expect(result).toHaveLength(2);
220205
expect(result[0]).toEqual(mockResult1);
221206
expect(result[1]).toEqual(mockResult2);

backend/src/document-processor/services/document-processor.service.ts

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ export interface ProcessedDocumentResult {
1313
simplifiedExplanation?: string;
1414
processingMetadata: {
1515
processingTimeMs: number;
16-
fileType: string;
1716
fileSize: number;
1817
};
1918
}
@@ -35,26 +34,20 @@ export class DocumentProcessorService {
3534
/**
3635
* Process a medical document by extracting text and performing analysis
3736
* @param fileBuffer The file buffer containing the image or PDF
38-
* @param fileType The MIME type of the file (e.g., 'image/jpeg', 'application/pdf')
3937
* @param userId The authenticated user's ID for rate limiting
4038
* @returns Processed document result with extracted text, analysis, and simplified explanation
4139
*/
42-
async processDocument(
43-
fileBuffer: Buffer,
44-
fileType: string,
45-
userId: string,
46-
): Promise<ProcessedDocumentResult> {
40+
async processDocument(fileBuffer: Buffer, userId: string): Promise<ProcessedDocumentResult> {
4741
try {
4842
const startTime = Date.now();
4943

5044
this.logger.log('Starting document processing', {
51-
fileType,
5245
fileSize: `${(fileBuffer.length / 1024).toFixed(2)} KB`,
5346
userId: this.hashIdentifier(userId),
5447
});
5548

5649
// Step 1: Extract text from document using AWS Textract
57-
const extractedText = await this.textractService.extractText(fileBuffer, fileType, userId);
50+
const extractedText = await this.textractService.extractText(fileBuffer, userId);
5851

5952
this.logger.log('Text extraction completed', {
6053
lineCount: extractedText.lines.length,
@@ -103,15 +96,13 @@ export class DocumentProcessorService {
10396
simplifiedExplanation,
10497
processingMetadata: {
10598
processingTimeMs: processingTime,
106-
fileType,
10799
fileSize: fileBuffer.length,
108100
},
109101
};
110102
} catch (error: unknown) {
111103
// Log error securely without exposing sensitive details
112104
this.logger.error('Error processing document', {
113105
error: error instanceof Error ? error.message : 'Unknown error',
114-
fileType,
115106
timestamp: new Date().toISOString(),
116107
userId: this.hashIdentifier(userId),
117108
});
@@ -146,12 +137,11 @@ export class DocumentProcessorService {
146137

147138
for (const doc of documents) {
148139
try {
149-
const result = await this.processDocument(doc.buffer, doc.type, userId);
140+
const result = await this.processDocument(doc.buffer, userId);
150141
results.push(result);
151142
} catch (error) {
152143
this.logger.error('Error processing document in batch', {
153144
error: error instanceof Error ? error.message : 'Unknown error',
154-
fileType: doc.type,
155145
fileSize: doc.buffer.length,
156146
});
157147

@@ -178,7 +168,6 @@ export class DocumentProcessorService {
178168
simplifiedExplanation: undefined,
179169
processingMetadata: {
180170
processingTimeMs: 0,
181-
fileType: doc.type,
182171
fileSize: doc.buffer.length,
183172
},
184173
});

backend/src/services/README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ export interface ProcessedDocumentResult {
3030
analysis: MedicalDocumentAnalysis;
3131
processingMetadata: {
3232
processingTimeMs: number;
33-
fileType: string;
3433
fileSize: number;
3534
};
3635
}
@@ -133,7 +132,6 @@ curl -X POST \
133132
},
134133
"processingMetadata": {
135134
"processingTimeMs": 2345,
136-
"fileType": "application/pdf",
137135
"fileSize": 12345
138136
}
139137
}
@@ -146,11 +144,10 @@ curl -X POST \
146144
constructor(private readonly documentProcessorService: DocumentProcessorService) {}
147145

148146
// Process a document
149-
async processReport(fileBuffer: Buffer, fileType: string, userId: string) {
147+
async processReport(fileBuffer: Buffer, userId: string) {
150148
try {
151149
const result = await this.documentProcessorService.processDocument(
152150
fileBuffer,
153-
fileType,
154151
userId
155152
);
156153

@@ -179,8 +176,8 @@ The service supports batch processing of multiple documents:
179176
```typescript
180177
const results = await documentProcessorService.processBatch(
181178
[
182-
{ buffer: fileBuffer1, type: fileType1 },
183-
{ buffer: fileBuffer2, type: fileType2 }
179+
{ buffer: fileBuffer1 },
180+
{ buffer: fileBuffer2 }
184181
],
185182
userId
186183
);

backend/src/utils/security.utils.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,23 @@ const validateFileType = (buffer: Buffer, mimeType: string): boolean => {
8686
* @param buffer The file buffer to validate
8787
* (The MIME type is not a parameter; it is detected from the buffer's leading bytes.)
8888
*/
89-
export const validateFileSecurely = (buffer: Buffer, mimeType: string): void => {
89+
export const validateFileSecurely = (buffer: Buffer): void => {
9090
const logger = new Logger('SecurityUtils');
91+
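// Detect the MIME type from the file's leading magic bytes rather than trusting a caller-supplied value.
// NOTE(review): only the first 4 bytes (8 hex characters) are read below, so the longer HEIC/HEIF
// signatures can never match via startsWith and such files fall through to 'Unsupported file type'.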
92+
let mimeType = buffer.toString('hex', 0, 4).toUpperCase();
93+
if (mimeType.startsWith('FFD8')) {
94+
mimeType = 'image/jpeg';
95+
} else if (mimeType.startsWith('89504E47')) {
96+
mimeType = 'image/png';
97+
} else if (mimeType.startsWith('00000020667479706865696300')) {
98+
mimeType = 'image/heic';
99+
} else if (mimeType.startsWith('0000001C667479706D696631')) {
100+
mimeType = 'image/heif';
101+
} else if (mimeType.startsWith('25504446')) {
102+
mimeType = 'application/pdf';
103+
} else {
104+
throw new BadRequestException('Unsupported file type');
105+
}
91106

92107
// 1. Check if file type is allowed
93108
if (!ALLOWED_MIME_TYPES.has(mimeType)) {

0 commit comments

Comments
 (0)