@@ -46,46 +46,58 @@ def gcs_loader_func(file_path):
4646   return  loader 
4747
def _ensure_nltk_resources():
    """Ensure the NLTK resources this loader depends on are available.

    Extends the NLTK search path with the container path and the user's home
    directory, then downloads each missing resource to ``~/.nltk_data``.
    Each resource is probed independently so a present 'punkt' does not mask
    a missing tagger.
    """
    nltk.data.path.append("/usr/local/nltk_data")
    nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
    # Map download name -> nltk.data.find() lookup path.
    resources = {
        "punkt": "tokenizers/punkt",
        "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    }
    for resource, lookup_path in resources.items():
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            logging.info(f"Downloading NLTK resource: {resource}")
            nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data"))


def _build_blob_name(gcs_bucket_folder, gcs_blob_filename):
    """Join an optional bucket folder and a filename into a blob path.

    Handles a ``None``/blank folder and a folder with or without a trailing
    slash, so exactly one '/' separates folder and filename.
    """
    if gcs_bucket_folder is not None and gcs_bucket_folder.strip() != "":
        if gcs_bucket_folder.endswith('/'):
            return gcs_bucket_folder + gcs_blob_filename
        return gcs_bucket_folder + '/' + gcs_blob_filename
    return gcs_blob_filename


def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
    """Load a document from a GCS bucket and return ``(filename, pages)``.

    Args:
        gcs_project_id: GCP project owning the bucket.
        gcs_bucket_name: Name of the GCS bucket.
        gcs_bucket_folder: Optional folder prefix inside the bucket (may be
            ``None`` or blank; a trailing '/' is tolerated).
        gcs_blob_filename: Name of the blob (file) to load.
        access_token: Optional OAuth2 access token. When given, the blob is
            downloaded directly and parsed as a PDF; otherwise the default
            credentials and ``GCSFileLoader`` are used.

    Returns:
        Tuple of the blob filename and a list of ``Document`` pages.

    Raises:
        LLMGraphBuilderException: If the blob does not exist in the bucket.
    """
    _ensure_nltk_resources()

    # BUG FIX: in the previous revision everything below was nested inside
    # the NLTK `except LookupError:` branch, so when the resources were
    # already installed the function fell through and returned None.
    blob_name = _build_blob_name(gcs_bucket_folder, gcs_blob_filename)
    logging.info(f"GCS project_id : {gcs_project_id}")

    if access_token is None:
        storage_client = storage.Client(project=gcs_project_id)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if not blob.exists():
            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
        loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
        pages = loader.load()
    else:
        # Token-based access: authenticate explicitly and parse the blob as
        # a PDF in memory instead of going through GCSFileLoader.
        creds = Credentials(access_token)
        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
        bucket = storage_client.bucket(gcs_bucket_name)
        blob = bucket.blob(blob_name)
        if not blob.exists():
            raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
        content = blob.download_as_bytes()
        pdf_reader = PdfReader(io.BytesIO(content))
        # Extract text from all pages; extract_text() may return None for
        # image-only pages, so guard with `or ""` to avoid a TypeError.
        text = "".join((page.extract_text() or "") for page in pdf_reader.pages)
        pages = [Document(page_content=text)]
    return gcs_blob_filename, pages
89101
90102def  upload_file_to_gcs (file_chunk , chunk_number , original_file_name , bucket_name , folder_name_sha1_hashed ):
91103  try :
0 commit comments