Skip to content

Commit 8dc9d57

Browse files
committed
fixup! feat(import): encode file name #119
1 parent 66a0370 commit 8dc9d57

File tree

1 file changed

+92
-46
lines changed

1 file changed

+92
-46
lines changed

lib/autorequire/data_import.rb

Lines changed: 92 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -187,71 +187,117 @@ def self.restore_default_users
187187
end
188188

189189
# Imports the training documents listed in CMEFiles.csv and attaches those
# that also exist in the Azure listing to their topics.
#
# Pipeline: load the CSV, keep rows whose topic exists, fetch the Azure file
# listing, intersect both by file name, attach every match, and print
# counters before and after the attachment pass.
#
# @return [Object] whatever log_final_results returns
def self.import_training_documents
  csv_data = load_training_documents_csv
  import_stats = initialize_import_stats

  valid_csv_rows = filter_rows_with_existing_topics(csv_data, import_stats)
  azure_files = fetch_azure_files
  # Only files that are present in Azure AND listed in the CSV are imported.
  importable_rows = match_csv_with_azure_files(valid_csv_rows, azure_files)

  log_import_summary(valid_csv_rows, azure_files, importable_rows)

  process_document_attachments(importable_rows, import_stats)
  log_final_results(import_stats)
end
202+
203+
private
204+
205+
# Reads CMEFiles.csv (located through file_path), treating the first line as
# the header row.
#
# NOTE(review): this helper and the `def self.` helpers after it sit under a
# bare `private`, which does not change the visibility of singleton methods,
# so they remain publicly callable. Use `private_class_method` if they are
# meant to be hidden, after confirming nothing outside calls them.
#
# @return [CSV::Table] parsed rows, addressable by header name
def self.load_training_documents_csv
  CSV.read(file_path("CMEFiles.csv"), headers: true)
end
208+
209+
# Builds the empty counters hash that the import helpers fill in.
#
# Keys:
# - :topics_without_csv      topic ids read from the CSV that had no matching
#                            Topic (the key name reads the other way round)
# - :successful_attachments  [row, topic] pairs whose save succeeded
# - :failed_attachments      [row, topic] pairs whose save did not succeed
# - :error_files             hashes describing download/URI errors
#
# @return [Hash{Symbol => Array}] four independent, empty arrays
def self.initialize_import_stats
  {
    topics_without_csv: [],
    successful_attachments: [],
    failed_attachments: [],
    error_files: []
  }
end
217+
218+
# Keeps the CSV rows whose Topic_ID matches an existing Topic and records the
# ids of the rest in stats[:topics_without_csv].
#
# All referenced ids are looked up in a single query instead of one
# Topic.find_by per row. A blank Topic_ID becomes 0 via to_i and is recorded
# as missing unless a Topic with id 0 exists.
#
# @param csv_data [Enumerable] rows responding to ["Topic_ID"]
# @param stats [Hash] import counters; :topics_without_csv is appended to
# @return [Array] the rows that have a matching topic, in input order
def self.filter_rows_with_existing_topics(csv_data, stats)
  topic_ids = csv_data.map { |row| row["Topic_ID"].to_i }.uniq
  # Hash used as a membership index so each row check is a constant-time lookup.
  existing_ids = Topic.where(id: topic_ids).pluck(:id).each_with_object({}) do |id, index|
    index[id] = true
  end

  csv_data.filter_map do |row|
    topic_id = row["Topic_ID"].to_i
    next row if existing_ids.key?(topic_id)

    stats[:topics_without_csv] << topic_id
    nil
  end
end
207229

208-
# Pre-fetch all the training files from Azure for each language and state
209-
all_azure_files = self.fetch_azure_files
210-
211-
# Filter all fetched azure files against the csv_training_index by name
212-
csv_with_azure_files = all_azure_files.filter_map do |file|
213-
csv_training_index.find { |row| row["File_Name"] == file[:name] }
230+
# Returns, for every Azure file, the first CSV row whose File_Name equals the
# file's :name. Files without a matching row are dropped.
#
# The result follows the order of azure_files and contains one entry per
# matching file, so a row appears again if several files share its name.
#
# @param csv_rows [Enumerable] rows responding to ["File_Name"]
# @param azure_files [Enumerable<Hash>] entries with a :name key
# @return [Array] matching CSV rows
def self.match_csv_with_azure_files(csv_rows, azure_files)
  # Index rows by name once; ||= keeps the first row for a duplicated name,
  # matching what a linear `find` would return.
  rows_by_name = csv_rows.each_with_object({}) do |row, index|
    index[row["File_Name"]] ||= row
  end

  azure_files.filter_map { |file| rows_by_name[file[:name]] }
end
215235

216-
puts "csv_training_index: #{csv_training_index.size}"
217-
puts "all_azure_files: #{all_azure_files.size}"
218-
puts "csv_with_azure_files: #{csv_with_azure_files.size}"
219-
220-
# Since we import only existing files in Azure, we rely on our filtered list
221-
csv_with_azure_files.each do |row|
236+
# Attaches the document described by each row to its topic.
#
# Rows whose Topic_ID no longer resolves to a Topic are skipped silently;
# nothing is recorded in stats for them.
#
# @param rows [Enumerable] CSV rows responding to ["Topic_ID"]
# @param stats [Hash] import counters, forwarded to attach_document_to_topic
# @return [Enumerable] rows, as returned by #each
def self.process_document_attachments(rows, stats)
  rows.each do |row|
    topic = Topic.find_by(id: row["Topic_ID"])
    next unless topic

    attach_document_to_topic(topic, row, stats)
  end
end
226244

227-
puts "Requesting: #{file_path}/#{row["File_Name"]}"
245+
# Downloads one file from Azure, attaches it to the topic's documents and
# records the outcome in stats.
#
# Outcomes:
# - save succeeds            -> [row, topic] appended to :successful_attachments
# - save returns false       -> [row, topic] appended to :failed_attachments
# - Azure API / URI error    -> delegated to handle_attachment_error
#
# `save` is used rather than `save!`: the bang variant raises on failure
# instead of returning false, which would make the failure branch
# unreachable and let the exception escape the rescue below.
#
# @param topic [Object] record responding to state, language.name,
#   documents.attach and save
# @param row [#[]] CSV row providing "File_Name" and "File_Type"
# @param stats [Hash] import counters, mutated in place
# @return [Object] result of the last recording step
def self.attach_document_to_topic(topic, row, stats)
  file_path = get_file_path(topic.state, topic.language.name)
  filename = row["File_Name"]

  puts "Requesting: #{file_path}/#{filename}"

  begin
    file_content = download_azure_file(file_path, filename)

    topic.documents.attach(
      io: StringIO.new(file_content),
      filename: filename,
      content_type: detect_content_type(row["File_Type"])
    )

    if topic.save
      stats[:successful_attachments] << [row, topic]
    else
      stats[:failed_attachments] << [row, topic]
    end
  rescue AzureFileShares::Errors::ApiError, URI::InvalidURIError => e
    handle_attachment_error(topic, filename, e, stats)
  end
end
270+
271+
# Downloads a file from the Azure file share named by the
# AZURE_STORAGE_SHARE_NAME environment variable.
#
# The file name is percent-encoded for use as a URL path segment.
# URI.encode_www_form_component produces form encoding, where a space
# becomes "+"; inside a path "+" is a literal plus sign, so spaces are
# rewritten to "%20". A literal "+" in the name is already emitted as "%2B",
# so every "+" left in the encoded string stands for a space.
#
# NOTE(review): assumes the Azure client sends the name as given, without
# encoding it again; confirm against the client library.
#
# @param file_path [String] directory path inside the share
# @param filename [String] raw (unencoded) file name
# @return [Object] whatever the client's download_file returns
def self.download_azure_file(file_path, filename)
  encoded_filename = URI.encode_www_form_component(filename).gsub("+", "%20")
  AzureFileShares.client.files.download_file(
    ENV["AZURE_STORAGE_SHARE_NAME"],
    file_path,
    encoded_filename
  )
end
279+
280+
# Records a failed download/attachment in stats[:error_files] and prints a
# one-line message to standard output.
#
# @param topic [Object] topic being processed; must respond to title
# @param filename [String] name of the file that failed
# @param error [Exception] the rescued error; only its message is kept
# @param stats [Hash] import counters; :error_files is appended to
# @return [nil]
def self.handle_attachment_error(topic, filename, error, stats)
  error_info = {
    topic: topic,
    file: filename,
    error: error.message
  }
  stats[:error_files] << error_info
  puts "Error with file: #{filename} for topic #{topic.title} - #{error.message}"
end
289+
290+
# Prints the sizes of the three collections that determine what will be
# imported, before any attachment is attempted.
#
# @param csv_rows [#size] CSV rows that have a matching topic
# @param azure_files [#size] files listed in Azure
# @param importable_rows [#size] CSV rows that also exist in Azure
# @return [nil]
def self.log_import_summary(csv_rows, azure_files, importable_rows)
  puts "CSV rows with topics: #{csv_rows.size}"
  puts "Azure files found: #{azure_files.size}"
  puts "Importable files: #{importable_rows.size}"
end
249295

250-
puts "topics not found: #{csv_files_without_topics.size}"
251-
puts "topics found: #{files_with_topics.size}"
252-
puts "attached files: #{attached_files.size}"
253-
puts "non_attached_files: #{non_attached_files.size}"
254-
puts "files with problems: #{files_with_problems.size}"
296+
# Prints the final counters collected during the import.
#
# "Topics not found" reports stats[:topics_without_csv], i.e. the number of
# CSV topic ids that had no matching Topic.
#
# @param stats [Hash] counters hash with :topics_without_csv,
#   :successful_attachments, :failed_attachments and :error_files
# @return [nil]
def self.log_final_results(stats)
  puts "Topics not found: #{stats[:topics_without_csv].size}"
  puts "Successful attachments: #{stats[:successful_attachments].size}"
  puts "Failed attachments: #{stats[:failed_attachments].size}"
  puts "Files with errors: #{stats[:error_files].size}"
end
256302

257303
private

0 commit comments

Comments
 (0)