samples/02-samples/12-medical-document-processing-assistant/document_processor.py at main · madhununna/samples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3

import os
import json
import boto3
import base64
from pypdf import PdfReader
from strands import tool

@tool
def process_document(file_path: str) -> str:
    """
    Process a medical document (PDF or image) and extract its content using Amazon Bedrock.

    Args:
        file_path: Path to the document file (PDF or image)

    Returns:
        Extracted text content from the document
    """
    # NOTE(review): the docstring above is kept word-for-word on purpose; the
    # strands @tool decorator presumably builds the tool description from it --
    # confirm before rewording.

    # Guard: nothing to do when the path does not exist.
    if not os.path.exists(file_path):
        return json.dumps({"error": f"File not found: {file_path}"})

    suffix = os.path.splitext(file_path)[1].lower()
    image_suffixes = ('.jpg', '.jpeg', '.png', '.tiff', '.bmp')

    # Guard: anything that is neither a PDF nor a known image type is rejected.
    if suffix != '.pdf' and suffix not in image_suffixes:
        return json.dumps({"error": f"Unsupported file format: {suffix}"})

    try:
        if suffix == '.pdf':
            # PDFs go to Bedrock first; if that raises, fall back to local
            # text extraction with pypdf.
            try:
                return _use_bedrock_for_document(file_path)
            except Exception as bedrock_error:
                print(f"Bedrock processing failed, falling back to PDF extraction: {bedrock_error!s}")
                return _process_pdf_traditional(file_path)

        # Images have no local fallback: Bedrock is the only route, and a
        # failure is reported as an error payload by the handler below.
        return _use_bedrock_for_document(file_path)

    except Exception as failure:
        return json.dumps({"error": f"Error processing document: {failure!s}"})

def _process_pdf_traditional(file_path: str) -> str:
    """Extract text from a PDF file using PyPDF."""
    try:
        # Extract text directly from PDF
        reader = PdfReader(file_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"

        # If text extraction yields meaningful content, return it
        if len(text.strip()) > 50:
            return text

        # If minimal text was extracted, the PDF might be scanned/image-based
        # In this case, we can't extract text using traditional methods
        return json.dumps({"error": "PDF appears to be image-based and requires Bedrock for processing"})

    except Exception as e:
        return json.dumps({"error": f"Error processing PDF: {str(e)}"})

def _use_bedrock_for_document(file_path: str) -> str:
    """Use Amazon Bedrock for document processing."""
    try:
        # Initialize Bedrock client
        bedrock_runtime = boto3.client(
            service_name='bedrock-runtime',
            region_name=os.environ.get('AWS_REGION', 'us-east-1')
        )

        # Read file as base64
        with open(file_path, 'rb') as file:
            file_bytes = file.read()
            base64_data = base64.b64encode(file_bytes).decode('utf-8')

        # Determine media type based on file extension
        file_extension = os.path.splitext(file_path)[1].lower()
        if file_extension == '.pdf':
            media_type = 'application/pdf'
        elif file_extension == '.png':
            media_type = 'image/png'
        elif file_extension in ['.jpg', '.jpeg']:
            media_type = 'image/jpeg'
        elif file_extension == '.tiff':
            media_type = 'image/tiff'
        elif file_extension == '.bmp':
            media_type = 'image/bmp'
        else:
            media_type = 'application/octet-stream'

        # Prepare request for Claude model
        model_id = os.environ.get('BEDROCK_MODEL_ID', 'anthropic.claude-3-sonnet-20240229-v1:0')

        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 4096,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": media_type,
                                "data": base64_data
                            }
                        },
                        {
                            "type": "text",
                            "text": "Extract all text content from this medical document. Preserve the formatting as much as possible. Include all medical terms, diagnoses, medications, and treatments. Be thorough and capture all details from the document."
                        }
                    ]
                }
            ]
        }

        # Invoke Bedrock model
        response = bedrock_runtime.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body)
        )

        # Parse response
        response_body = json.loads(response['body'].read().decode('utf-8'))
        extracted_text = response_body['content'][0]['text']

        return extracted_text

    except Exception as e:
        raise Exception(f"Error using Bedrock for document processing: {str(e)}")