@@ -26,6 +26,7 @@ def self.import_all
2626 import_topics
2727 import_tags
2828 import_topic_tags
29+ import_training_documents
2930 restore_default_users
3031 end
3132
@@ -184,4 +185,163 @@ def self.restore_default_users
184185
185186 Provider . first . users << me unless Provider . first . users . include? ( me )
186187 end
188+
# Runs the training-document import from start to finish:
#   1. load the CSV and keep only rows whose topic exists,
#   2. list the files on the Azure share and keep the CSV rows that have a
#      matching file,
#   3. print a summary, attach each file to its topic, print the final counts.
#
# Returns whatever log_final_results returns.
def self.import_training_documents
  table = load_training_documents_csv
  stats = initialize_import_stats

  rows_with_topic = filter_rows_with_existing_topics(table, stats)
  remote_files = fetch_azure_files
  rows_to_import = match_csv_with_azure_files(rows_with_topic, remote_files)
  log_import_summary(rows_with_topic, remote_files, rows_to_import)

  process_document_attachments(rows_to_import, stats)
  log_final_results(stats)
end
202+
203+ private
204+
# Reads "CMEFiles.csv" (located through the file_path helper) and returns the
# parsed CSV with its first line treated as the header row.
def self.load_training_documents_csv
  csv_location = file_path("CMEFiles.csv")
  CSV.read(csv_location, headers: true)
end
208+
# Builds the bookkeeping hash used throughout the import.
#
# Returns a Hash with four Symbol keys, each mapped to its own empty Array.
def self.initialize_import_stats
  %i[
    topics_without_csv
    successful_attachments
    failed_attachments
    error_files
  ].to_h { |bucket| [bucket, []] }
end
217+
# Keeps the CSV rows whose "Topic_ID" refers to an existing Topic.
#
# csv_data - enumerable of rows readable as row["Topic_ID"]
# stats    - import stats hash; the integer id of every row WITHOUT a matching
#            topic is appended to stats[:topics_without_csv]
#
# Returns an Array of the rows that do have a topic, in their original order.
def self.filter_rows_with_existing_topics(csv_data, stats)
  csv_data.each_with_object([]) do |row, kept|
    # to_i turns a blank or non-numeric id into 0.
    topic_id = row["Topic_ID"].to_i

    if Topic.find_by(id: topic_id)
      kept << row
    else
      stats[:topics_without_csv] << topic_id
    end
  end
end
229+
# Pairs each Azure file with the CSV row that names it.
#
# csv_rows    - rows readable as row["File_Name"]
# azure_files - entries readable as file[:name]
#
# Returns an Array with one element per Azure file that has a matching row,
# in the order of azure_files. When several rows share a file name the first
# one wins; when several Azure files share a name the same row is returned
# once per file.
def self.match_csv_with_azure_files(csv_rows, azure_files)
  # Index rows by file name once instead of scanning every row for every file.
  # `||=` keeps the first row seen for a name, matching a left-to-right search.
  rows_by_name = csv_rows.each_with_object({}) do |row, index|
    index[row["File_Name"]] ||= row
  end

  azure_files.filter_map { |file| rows_by_name[file[:name]] }
end
235+
# Attaches the file named by each row to the row's topic.
#
# rows  - rows readable as row["Topic_ID"]
# stats - import stats hash, handed through to attach_document_to_topic
#
# Rows whose topic cannot be found are skipped without being recorded.
def self.process_document_attachments(rows, stats)
  rows.each do |row|
    topic = Topic.find_by(id: row["Topic_ID"])
    attach_document_to_topic(topic, row, stats) if topic
  end
end
244+
# Downloads one file from the Azure share and attaches it to +topic+.
#
# topic - object exposing state, language.name, documents.attach and save
# row   - CSV row; "File_Name" is the file to download and attach
# stats - import stats hash. Exactly one bucket receives an entry:
#           :successful_attachments - [row, topic] when the topic saved
#           :failed_attachments     - [row, topic] when save returned false
#           :error_files            - via handle_attachment_error, when the
#                                     download raised one of the rescued errors
#
# Errors other than AzureFileShares::Errors::ApiError and URI::InvalidURIError
# are not rescued here.
def self.attach_document_to_topic(topic, row, stats)
  file_path = get_file_path(topic.state, topic.language.name)
  filename = row["File_Name"]

  puts "Requesting: #{file_path}/#{filename}"

  file_content = download_azure_file(file_path, filename)

  topic.documents.attach(
    io: StringIO.new(file_content),
    filename: filename,
    # detect_content_type reads the extension of a file name, so it must be
    # given the file name; a bare type value has no extension to inspect.
    content_type: detect_content_type(filename)
  )

  # `save` returns false on failure, which lets the failure be counted.
  # `save!` would raise instead and abort the rest of the import.
  bucket = topic.save ? :successful_attachments : :failed_attachments
  stats[bucket] << [row, topic]
rescue AzureFileShares::Errors::ApiError, URI::InvalidURIError => e
  handle_attachment_error(topic, filename, e, stats)
end
270+
# Downloads +filename+ from directory +file_path+ of the Azure file share named
# by ENV["AZURE_STORAGE_SHARE_NAME"].
#
# file_path - directory inside the share
# filename  - raw (unencoded) file name
#
# Returns whatever the client's download_file returns.
def self.download_azure_file(file_path, filename)
  # encode_www_form_component produces form encoding, where a space becomes
  # "+". Inside a URL path "+" is a literal plus sign, so spaces have to be
  # sent as "%20". A real "+" in the name is already encoded as "%2B" at this
  # point, so every "+" that remains stands for a space.
  # NOTE(review): assumes the client places this name in the request URL path
  # without encoding it again - confirm against the client.
  encoded_filename = URI.encode_www_form_component(filename).gsub("+", "%20")

  AzureFileShares.client.files.download_file(
    ENV["AZURE_STORAGE_SHARE_NAME"],
    file_path,
    encoded_filename
  )
end
279+
# Records a failed download and reports it on standard output.
#
# topic    - object responding to title
# filename - name of the file that could not be processed
# error    - the rescued exception; only its message is kept
# stats    - import stats hash; a {topic:, file:, error:} entry is appended
#            to stats[:error_files]
#
# Returns nil.
def self.handle_attachment_error(topic, filename, error, stats)
  stats[:error_files] << { topic: topic, file: filename, error: error.message }

  puts "Error with file: #{filename} for topic #{topic.title} - #{error.message}"
end
289+
# Prints three lines with the sizes of the given collections: CSV rows that
# have a topic, files found on Azure, and rows that can be imported.
#
# Returns nil.
def self.log_import_summary(csv_rows, azure_files, importable_rows)
  puts "CSV rows with topics: #{csv_rows.size}",
       "Azure files found: #{azure_files.size}",
       "Importable files: #{importable_rows.size}"
end
295+
# Prints one line per stats bucket with the number of entries it holds.
#
# stats - import stats hash with :topics_without_csv, :successful_attachments,
#         :failed_attachments and :error_files
#
# Returns nil.
def self.log_final_results(stats)
  labelled_buckets = {
    "Topics not found" => :topics_without_csv,
    "Successful attachments" => :successful_attachments,
    "Failed attachments" => :failed_attachments,
    "Files with errors" => :error_files,
  }

  puts(labelled_buckets.map { |label, bucket| "#{label}: #{stats[bucket].size}" })
end
302+
303+ private
304+
# Maps a topic state and language name to a directory on the Azure share.
#
# state    - "active" or "archived"
# language - "english" or "spanish"
#
# Returns the directory path as a String.
# Raises NoMatchingPatternError for any other combination (there is no
# fallback branch).
def self.get_file_path(state, language)
  case [state, language]
  in ["active", "english"] then "CMES-Pi/assets/Content"
  in ["archived", "english"] then "CMES-Pi_Archive"
  in ["active", "spanish"] then "SP_CMES-Pi/assets/Content"
  in ["archived", "spanish"] then "SP_CMES-Pi_Archive"
  end
end
317+
# Lists the files in all four content directories of the Azure share named by
# ENV["AZURE_STORAGE_SHARE_NAME"].
#
# Directories are queried in this order: active/english, active/spanish,
# archived/english, archived/spanish.
#
# Returns one flat Array made from the :files entry of each listing.
def self.fetch_azure_files
  client = AzureFileShares.client

  directories = [
    %w[active english],
    %w[active spanish],
    %w[archived english],
    %w[archived spanish],
  ]

  # Issue every listing request first, then pull the :files entries out.
  listings = directories.map do |state, language|
    client.files.list(ENV["AZURE_STORAGE_SHARE_NAME"], get_file_path(state, language))
  end

  listings.map { |listing| listing[:files] }.flatten
end
332+
# Picks a MIME type from the extension of +filename+ (case-insensitive).
#
# Returns "audio/mpeg", "application/pdf", "image/jpeg" or "image/png" for
# .mp3, .pdf, .jpg/.jpeg and .png; "application/octet-stream" for anything
# else, including names without an extension.
def self.detect_content_type(filename)
  mime_by_extension = {
    ".mp3" => "audio/mpeg",
    ".pdf" => "application/pdf",
    ".jpg" => "image/jpeg",
    ".jpeg" => "image/jpeg",
    ".png" => "image/png",
  }

  extension = File.extname(filename).downcase
  mime_by_extension.fetch(extension, "application/octet-stream")
end
187347end
0 commit comments