|
6 | 6 | import boto3
|
7 | 7 | from botocore.exceptions import NoCredentialsError
|
8 | 8 | from io import BytesIO
|
| 9 | +import tempfile |
| 10 | + |
9 | 11 |
|
import os

# SECURITY: AWS credentials must never be committed to source control. The key
# pair that used to be hard-coded here has been exposed and should be rotated.
# The values are now read from the standard AWS environment variables; when a
# variable is unset the value is None.
# NOTE(review): confirm the code that builds the S3 client tolerates None
# (boto3 then falls back to its default credential chain).
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID")
s3_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
|
@@ -34,91 +36,99 @@ def download_files_from_s3(bucket_name, prefix, local_directory):
|
34 | 36 |
|
35 | 37 |
|
36 | 38 |
|
37 |
| -def pdf_to_images(pdf_path, output_folder): |
def pdf_to_images_from_bytes(pdf_content, output_folder, file_name):
    """Render an in-memory PDF to JPEG pages and upload each page to S3.

    Args:
        pdf_content: Raw bytes of the PDF document.
        output_folder: S3 key prefix under which the pages are stored; page
            ``n`` is written to ``{output_folder}/page_{n}.jpeg``.
        file_name: Name of the source document. Currently unused; kept so
            existing callers keep working.

    Returns:
        A tuple ``(image_paths, noImg)`` where ``image_paths`` is the list of
        S3 keys that were written (in page order) and ``noImg`` is the number
        of pages. A PDF that yields no pages returns ``([], 0)``.
    """
    s3_bucket_name = 'learnmateai'

    # convert_from_path needs a path on disk, so spill the bytes to a
    # temporary file. delete=False keeps the file around after the handle is
    # closed so the converter can open it by name.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp_file:
            temp_file.write(pdf_content)

        # Convert PDF pages to images
        images = convert_from_path(temp_file.name)
    finally:
        # Always remove the temporary file, even when writing or conversion
        # fails, so failed requests do not accumulate files on disk.
        os.remove(temp_file.name)

    # Save each image to S3
    image_paths = []
    for page_number, image in enumerate(images, start=1):
        image_bytes = BytesIO()
        image.save(image_bytes, 'JPEG')
        # Rewind so the upload reads the buffer from the beginning.
        image_bytes.seek(0)

        image_key = f'{output_folder}/page_{page_number}.jpeg'
        s3.put_object(Body=image_bytes, Bucket=s3_bucket_name, Key=image_key)

        image_paths.append(image_key)

    # Derive the count from the list so an empty PDF returns 0 instead of
    # failing on a variable that was only assigned inside the loop.
    return image_paths, len(image_paths)
54 | 66 |
|
def _detect_page_text(client, image_content):
    """Run Cloud Vision text detection on one image and return its text.

    Args:
        client: A ``vision.ImageAnnotatorClient`` instance.
        image_content: Raw bytes of the image to analyse.

    Returns:
        The description of the first text annotation, or ``""`` when the
        response contains no annotations.

    Raises:
        Exception: If the Vision response carries an error message.
    """
    response = client.text_detection(image=vision.Image(content=image_content))

    # Check for an API error before touching the annotations, so a failed
    # request surfaces its real message instead of an IndexError.
    if response.error.message:
        raise Exception(
            '{}\nFor more info on error messages, check: '
            'https://cloud.google.com/apis/design/errors'.format(
                response.error.message))

    if not response.text_annotations:
        # Nothing was recognised on this page (e.g. a blank page).
        return ""
    return str(response.text_annotations[0].description)


@router.get("/notestotext")
def NotesToText_handler():
    """Convert every PDF under ``notes_pdf/`` in S3 to text via OCR.

    For each PDF object in the bucket the handler:
      1. downloads the PDF,
      2. renders its pages to JPEGs stored under
         ``images/Notes_images/<name>/`` (via ``pdf_to_images_from_bytes``),
      3. runs Cloud Vision text detection on each page image,
      4. uploads the concatenated text to ``notes_txt/<name>.txt``.

    Raises:
        Exception: If Cloud Vision reports an error for any page.
    """
    substring_to_remove = "Scanned by CamScanner"
    s3_bucket_name = 'learnmateai'
    prefix = 'notes_pdf/'

    # A single list_objects_v2 call returns a limited batch of keys, so page
    # through the listing. Only real PDF objects are kept: this skips the
    # "folder" placeholder key and any non-PDF files under the prefix.
    paginator = s3.get_paginator('list_objects_v2')
    pdf_keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=s3_bucket_name, Prefix=prefix)
        for obj in page.get('Contents', [])
        if obj['Key'].lower().endswith('.pdf')
    ]

    client = None

    # Process each file
    for pdf_key in pdf_keys:
        file_name = os.path.splitext(os.path.basename(pdf_key))[0]

        print(f"converting {file_name}....")

        # Download the PDF using the key exactly as listed, rather than
        # rebuilding it from the base name.
        pdf_object = s3.get_object(Bucket=s3_bucket_name, Key=pdf_key)
        pdf_content = pdf_object['Body'].read()

        # Create the output folder in S3
        output_folder = f'images/Notes_images/{file_name}'
        s3.put_object(Body='', Bucket=s3_bucket_name, Key=f'{output_folder}/')

        # Convert the PDF to images and save them in the output folder in S3
        image_paths, noImg = pdf_to_images_from_bytes(pdf_content, output_folder, file_name)
        print(noImg)

        if client is None:
            # Created once, on first use, instead of once per file; an empty
            # listing therefore never needs the Vision credentials file.
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'Files/client_file_vision.json'
            client = vision.ImageAnnotatorClient()

        # The output text starts with a single space, followed by each
        # page's text with the scanner watermark removed.
        page_texts = [" "]
        for image_path in image_paths:
            # Download the image from S3
            image_object = s3.get_object(Bucket=s3_bucket_name, Key=image_path)
            image_content = image_object['Body'].read()

            text = _detect_page_text(client, image_content)
            page_texts.append(text.replace(substring_to_remove, ""))

        s3_key = f'notes_txt/{file_name}.txt'

        # Upload the text content to S3
        s3.put_object(
            Body="".join(page_texts),
            Bucket=s3_bucket_name,
            Key=s3_key
        )
122 | 132 |
|
123 | 133 |
|
124 | 134 |
|
|
0 commit comments