33import logging
44import os
55import re
6- from typing import List , Optional , Union , NamedTuple , Tuple
6+ from enum import Enum
7+ from typing import List , Optional , Union
78
89import fitz # type: ignore
910from azure .core .credentials_async import AsyncTokenCredential
1011from azure .storage .blob import (
12+ BlobClient ,
1113 BlobSasPermissions ,
1214 UserDelegationKey ,
13- generate_blob_sas ,
14- BlobClient
15+ generate_blob_sas ,
1516)
1617from azure .storage .blob .aio import BlobServiceClient , ContainerClient
1718from PIL import Image , ImageDraw , ImageFont
2122
2223logger = logging .getLogger ("scripts" )
2324
25+
2426class BlobManager :
2527 """
2628 Class to manage uploading and deleting blobs containing citation information from a blob storage account
@@ -45,58 +47,60 @@ def __init__(
4547 self .subscriptionId = subscriptionId
4648 self .user_delegation_key : Optional [UserDelegationKey ] = None
4749
48- #async def upload_blob(self, file: File, container_client:ContainerClient) -> Optional[List[str]]:
49-
50- async def _create_new_blob (self , file : File , container_client :ContainerClient ) -> BlobClient :
50+ async def _create_new_blob (self , file : File , container_client : ContainerClient ) -> BlobClient :
5151 with open (file .content .name , "rb" ) as reopened_file :
52- blob_name = BlobManager .blob_name_from_file_name (file .content .name )
53- logger .info ("Uploading blob for whole file -> %s" , blob_name )
54- return await container_client .upload_blob (blob_name , reopened_file , overwrite = True , metadata = file .metadata )
52+ blob_name = BlobManager .blob_name_from_file_name (file .content .name )
53+ logger .info ("Uploading blob for whole file -> %s" , blob_name )
54+ return await container_client .upload_blob (blob_name , reopened_file , overwrite = True , metadata = file .metadata )
5555
56- async def _file_blob_update_needed (self , blob_client : BlobClient , file : File ) -> bool :
57- md5_check : int = 0 # 0= not done, 1, positive,. 2 negative
56+ async def _file_blob_update_needed (self , blob_client : BlobClient , file : File ) -> bool :
5857 # Get existing blob properties
5958 blob_properties = await blob_client .get_blob_properties ()
6059 blob_metadata = blob_properties .metadata
61-
60+
6261 # Check if the md5 values are the same
63- file_md5 = file .metadata .get ('md5' )
64- blob_md5 = blob_metadata .get ('md5' )
65-
66- # Remove md5 from file metadata if it matches the blob metadata
67- if file_md5 and file_md5 != blob_md5 :
68- return True
69- else :
70- return False
71-
62+ file_md5 = file .metadata .get ("md5" )
63+ blob_md5 = blob_metadata .get ("md5" )
64+
65+ # If the file has an md5 value, check if it is different from the blob
66+ return file_md5 and file_md5 != blob_md5
67+
7268 async def upload_blob (self , file : File ) -> Optional [List [str ]]:
7369 async with BlobServiceClient (
7470 account_url = self .endpoint , credential = self .credential , max_single_put_size = 4 * 1024 * 1024
7571 ) as service_client , service_client .get_container_client (self .container ) as container_client :
7672 if not await container_client .exists ():
7773 await container_client .create_container ()
78-
79- # Re-open and upload the original file
80- md5_check : int = 0 # 0= not done, 1, positive,. 2 negative
81-
82- # upload the file local storage zu azure storage
74+
75+ # Re-open and upload the original file if the blob does not exist or the md5 values do not match
76+ class MD5Check (Enum ):
77+ NOT_DONE = 0
78+ MATCH = 1
79+ NO_MATCH = 2
80+
81+ md5_check = MD5Check .NOT_DONE
82+
83+ # Upload the file to Azure Storage
8384 # file.url is only None if files are not uploaded yet, for datalake it is set
8485 if file .url is None :
8586 blob_client = container_client .get_blob_client (file .url )
8687
8788 if not await blob_client .exists ():
89+ logger .info ("Blob %s does not exist, uploading" , file .url )
8890 blob_client = await self ._create_new_blob (file , container_client )
8991 else :
9092 if self ._blob_update_needed (blob_client , file ):
91- md5_check = 2
93+ logger .info ("Blob %s exists but md5 values do not match, updating" , file .url )
94+ md5_check = MD5Check .NO_MATCH
9295 # Upload the file with the updated metadata
9396 with open (file .content .name , "rb" ) as data :
9497 await blob_client .upload_blob (data , overwrite = True , metadata = file .metadata )
9598 else :
96- md5_check = 1
99+ logger .info ("Blob %s exists and md5 values match, skipping upload" , file .url )
100+ md5_check = MD5Check .MATCH
97101 file .url = blob_client .url
98-
99- if md5_check != 1 and self .store_page_images :
102+
103+ if md5_check != MD5Check . MATCH and self .store_page_images :
100104 if os .path .splitext (file .content .name )[1 ].lower () == ".pdf" :
101105 return await self .upload_pdf_blob_images (service_client , container_client , file )
102106 else :
@@ -127,20 +131,19 @@ async def upload_pdf_blob_images(
127131
128132 for i in range (page_count ):
129133 blob_name = BlobManager .blob_image_name_from_file_page (file .content .name , i )
130-
134+
131135 blob_client = container_client .get_blob_client (blob_name )
132- do_upload : bool = True
133136 if await blob_client .exists ():
134137 # Get existing blob properties
135138 blob_properties = await blob_client .get_blob_properties ()
136139 blob_metadata = blob_properties .metadata
137-
140+
138141 # Check if the md5 values are the same
139- file_md5 = file .metadata .get (' md5' )
140- blob_md5 = blob_metadata .get (' md5' )
142+ file_md5 = file .metadata .get (" md5" )
143+ blob_md5 = blob_metadata .get (" md5" )
141144 if file_md5 == blob_md5 :
142- continue # documemt already uploaded
143-
145+ continue # documemt already uploaded
146+
144147 logger .debug ("Converting page %s to image and uploading -> %s" , i , blob_name )
145148
146149 doc = fitz .open (file .content .name )
@@ -167,7 +170,7 @@ async def upload_pdf_blob_images(
167170 output = io .BytesIO ()
168171 new_img .save (output , format = "PNG" )
169172 output .seek (0 )
170-
173+
171174 await blob_client .upload_blob (data = output , overwrite = True , metadata = file .metadata )
172175 if not self .user_delegation_key :
173176 self .user_delegation_key = await service_client .get_user_delegation_key (start_time , expiry_time )
@@ -181,7 +184,7 @@ async def upload_pdf_blob_images(
181184 permission = BlobSasPermissions (read = True ),
182185 expiry = expiry_time ,
183186 start = start_time ,
184- )
187+ )
185188 sas_uris .append (f"{ blob_client .url } ?{ sas_token } " )
186189
187190 return sas_uris
0 commit comments