Skip to content

Commit 8871f07

Browse files
committed
Refactor AwsTextractService to streamline document processing logic
- Consolidated PDF and image processing into a single method, processDocument, in backend/src/services/aws-textract.service.ts for improved maintainability.
- Updated logging to differentiate between PDF and image processing within the new method.
- Removed redundant code related to separate processing methods for images and PDFs, enhancing code clarity.
1 parent cbc1b59 commit 8871f07

File tree

1 file changed

+12
-39
lines changed

1 file changed

+12
-39
lines changed

backend/src/services/aws-textract.service.ts

Lines changed: 12 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -108,19 +108,10 @@ export class AwsTextractService {
108108
contentHashPrefix: createHash('sha256').update(fileBuffer).digest('hex').substring(0, 10),
109109
});
110110

111-
// 3. Determine if we're processing a PDF or image
112-
const isPdf = fileType === 'application/pdf';
111+
// 3. Process document
112+
const result = await this.processDocument(fileBuffer, fileType);
113113

114-
// 4. Extract text differently based on file type
115-
let result: ExtractedTextResult;
116-
117-
if (isPdf) {
118-
result = await this.processPdf(fileBuffer);
119-
} else {
120-
result = await this.processImage(fileBuffer);
121-
}
122-
123-
// 5. Calculate processing time
114+
// 4. Calculate processing time
124115
const processingTime = Date.now() - startTime;
125116

126117
this.logger.log(`Document processed in ${processingTime}ms`, {
@@ -150,38 +141,20 @@ export class AwsTextractService {
150141
}
151142

152143
/**
153-
* Process a single image file
144+
* Process a document (image or PDF)
154145
*/
155-
private async processImage(imageBuffer: Buffer): Promise<ExtractedTextResult> {
156-
this.logger.log('Processing single image with Textract');
146+
private async processDocument(
147+
documentBuffer: Buffer,
148+
documentType: string,
149+
): Promise<ExtractedTextResult> {
150+
this.logger.log(
151+
`Processing ${documentType === 'application/pdf' ? 'PDF document' : 'single image'} with Textract`,
152+
);
157153

158154
// Use Analyze Document API for more comprehensive analysis
159155
const command = new AnalyzeDocumentCommand({
160156
Document: {
161-
Bytes: imageBuffer,
162-
},
163-
FeatureTypes: ['TABLES', 'FORMS'],
164-
});
165-
166-
const response = await this.client.send(command);
167-
168-
return this.parseTextractResponse(response);
169-
}
170-
171-
/**
172-
* Process a multi-page PDF document
173-
*/
174-
private async processPdf(pdfBuffer: Buffer): Promise<ExtractedTextResult> {
175-
this.logger.log('Processing PDF document with Textract');
176-
177-
// For PDF, first start an async job with StartDocumentTextDetection
178-
// But for simplicity in this implementation, we'll process just the first page
179-
// For a complete solution, you'd use the async APIs with S3
180-
181-
// Use Analyze Document API with first page only as a simplified approach
182-
const command = new AnalyzeDocumentCommand({
183-
Document: {
184-
Bytes: pdfBuffer,
157+
Bytes: documentBuffer,
185158
},
186159
FeatureTypes: ['TABLES', 'FORMS'],
187160
});

0 commit comments

Comments
 (0)