@@ -187,71 +187,117 @@ def self.restore_default_users
187187 end
188188
# Entry point for importing training documents.
#
# Loads the CSV index, keeps the rows whose topic exists, matches those rows
# against the files fetched from Azure, attaches every matched file to its
# topic and prints a summary before and after the attachment pass.
#
# Returns whatever log_final_results returns.
def self.import_training_documents
  csv_table = load_training_documents_csv
  stats = initialize_import_stats

  rows_with_topics = filter_rows_with_existing_topics(csv_table, stats)
  remote_files = fetch_azure_files
  attachable_rows = match_csv_with_azure_files(rows_with_topics, remote_files)

  log_import_summary(rows_with_topics, remote_files, attachable_rows)

  # Only rows that have both a topic and a matching Azure file are processed.
  process_document_attachments(attachable_rows, stats)
  log_final_results(stats)
end
202+
203+ private
204+
# Reads "CMEFiles.csv" (resolved through file_path) with the first line
# treated as the header row and returns the parsed result of CSV.read.
def self.load_training_documents_csv
  csv_location = file_path("CMEFiles.csv")
  CSV.read(csv_location, headers: true)
end
208+
# Builds the bookkeeping hash used during an import run: one fresh, empty
# array per bucket, keyed by symbol.
def self.initialize_import_stats
  %i[topics_without_csv successful_attachments failed_attachments error_files]
    .to_h { |bucket| [bucket, []] }
end
217+
# Keeps only the CSV rows whose "Topic_ID" resolves to an existing Topic.
#
# csv_data - enumerable of rows indexable by "Topic_ID".
# stats    - import stats hash; the integer id of every row without a topic
#            is appended to stats[:topics_without_csv].
#
# Returns an array with the rows that have a topic, in input order.
def self.filter_rows_with_existing_topics(csv_data, stats)
  csv_data.filter_map do |csv_row|
    # to_i maps a missing or non-numeric id to 0.
    parsed_id = csv_row["Topic_ID"].to_i
    next csv_row if Topic.find_by(id: parsed_id)

    stats[:topics_without_csv] << parsed_id
    nil
  end
end
207229
208- # Pre-fetch all the training files from Azure for each language and state
209- all_azure_files = self . fetch_azure_files
210-
211- # Filter all fetched azure files against the csv_training_index by name
212- csv_with_azure_files = all_azure_files . filter_map do |file |
213- csv_training_index . find { |row | row [ "File_Name" ] == file [ :name ] }
# Selects, for every Azure file, the CSV row whose "File_Name" equals the
# file's :name.
#
# csv_rows    - enumerable of rows indexable by "File_Name".
# azure_files - enumerable of hashes with a :name key.
#
# Returns an array of rows in Azure-file order. Files without a matching row
# are skipped; a row is repeated when several files share its name.
def self.match_csv_with_azure_files(csv_rows, azure_files)
  # Index the rows once instead of scanning them for every Azure file.
  # The first row seen for a name wins, mirroring a linear `find`.
  rows_by_file_name = csv_rows.each_with_object({}) do |row, index|
    name = row["File_Name"]
    index[name] = row unless index.key?(name)
  end

  azure_files.filter_map { |file| rows_by_file_name[file[:name]] }
end
215235
216- puts "csv_training_index: #{ csv_training_index . size } "
217- puts "all_azure_files: #{ all_azure_files . size } "
218- puts "csv_with_azure_files: #{ csv_with_azure_files . size } "
219-
220- # Since we import only existing files in Azure, we rely on our filtered list
221- csv_with_azure_files . each do |row |
# Walks the importable rows and attaches each row's document to the topic
# referenced by its "Topic_ID". Rows whose topic cannot be found are skipped.
#
# rows  - collection of rows indexable by "Topic_ID".
# stats - import stats hash, passed through to attach_document_to_topic.
#
# Returns rows (the result of #each).
def self.process_document_attachments(rows, stats)
  rows.each do |csv_row|
    matched_topic = Topic.find_by(id: csv_row["Topic_ID"])
    attach_document_to_topic(matched_topic, csv_row, stats) if matched_topic
  end
end
226244
227- puts "Requesting: #{ file_path } /#{ row [ "File_Name" ] } "
# Downloads the file named in row["File_Name"] from Azure and attaches it to
# topic.documents, then records the outcome in stats.
#
# topic - record responding to state, language.name, documents, save, title.
# row   - CSV row providing "File_Name" and "File_Type".
# stats - import stats hash. [row, topic] is appended to
#         :successful_attachments when the topic saves and to
#         :failed_attachments when it does not.
#
# AzureFileShares::Errors::ApiError and URI::InvalidURIError raised while
# downloading or attaching are handed to handle_attachment_error instead of
# propagating.
def self.attach_document_to_topic(topic, row, stats)
  file_path = get_file_path(topic.state, topic.language.name)
  filename = row["File_Name"]

  puts "Requesting: #{file_path}/#{filename}"

  begin
    file_content = download_azure_file(file_path, filename)

    topic.documents.attach(
      io: StringIO.new(file_content),
      filename: filename,
      content_type: detect_content_type(row["File_Type"])
    )

    # Non-bang save: it returns false on failure, so an unsaved topic is
    # recorded as a failed attachment. `save!` raises instead, which made the
    # failure branch unreachable and aborted the whole import.
    outcome = topic.save ? :successful_attachments : :failed_attachments
    stats[outcome] << [row, topic]
  rescue AzureFileShares::Errors::ApiError, URI::InvalidURIError => e
    handle_attachment_error(topic, filename, e, stats)
  end
end
270+
# Downloads a single file from the Azure file share named by the
# AZURE_STORAGE_SHARE_NAME environment variable.
#
# file_path - directory inside the share.
# filename  - name of the file; it is form-encoded before being sent.
#
# Returns whatever the AzureFileShares client's download_file returns.
#
# NOTE(review): encode_www_form_component turns spaces into "+"; confirm the
# client expects form encoding rather than percent-encoded path segments.
def self.download_azure_file(file_path, filename)
  share_name = ENV["AZURE_STORAGE_SHARE_NAME"]
  escaped_name = URI.encode_www_form_component(filename)

  AzureFileShares.client.files.download_file(share_name, file_path, escaped_name)
end
279+
# Records a failed download/attachment and reports it on stdout.
#
# topic    - topic the file was meant for (its title is printed).
# filename - name of the file that failed.
# error    - the rescued exception; only its message is kept.
# stats    - import stats hash; a { topic:, file:, error: } entry is appended
#            to stats[:error_files].
def self.handle_attachment_error(topic, filename, error, stats)
  reason = error.message
  stats[:error_files] << { topic: topic, file: filename, error: reason }
  puts "Error with file: #{filename} for topic #{topic.title} - #{reason}"
end
289+
# Prints how many CSV rows have a topic, how many Azure files were fetched
# and how many rows are importable, one line each. Each argument only needs
# to respond to #size.
def self.log_import_summary(csv_rows, azure_files, importable_rows)
  counts = {
    "CSV rows with topics" => csv_rows.size,
    "Azure files found" => azure_files.size,
    "Importable files" => importable_rows.size
  }

  # puts prints every element of an array on its own line.
  puts counts.map { |label, count| "#{label}: #{count}" }
end
249295
250- puts "topics not found: #{ csv_files_without_topics . size } "
251- puts "topics found: #{ files_with_topics . size } "
252- puts "attached files : #{ attached_files . size } "
253- puts "non_attached_files : #{ non_attached_files . size } "
254- puts "files with problems : #{ files_with_problems . size } "
# Prints the size of each bucket in the import stats hash, one line per
# bucket: missing topics, successful attachments, failed attachments and
# files that raised an error.
def self.log_final_results(stats)
  missing, attached, failed, errored =
    stats.values_at(:topics_without_csv, :successful_attachments, :failed_attachments, :error_files)
         .map(&:size)

  # puts prints every element of an array on its own line.
  puts [
    "Topics not found: #{missing}",
    "Successful attachments : #{attached}",
    "Failed attachments : #{failed}",
    "Files with errors : #{errored}"
  ]
end
256302
257303 private
0 commit comments