diff --git a/app/jobs/scheduled/automatic_translation_backfill.rb b/app/jobs/scheduled/automatic_translation_backfill.rb index 37176a6e..d1d6d51f 100644 --- a/app/jobs/scheduled/automatic_translation_backfill.rb +++ b/app/jobs/scheduled/automatic_translation_backfill.rb @@ -18,35 +18,33 @@ def execute(args = nil) end end - def fetch_untranslated_model_ids( - model, - content_column, - limit, - target_locales = backfill_locales - ) + def fetch_untranslated_model_ids(model, content_column, limit, target_locale) m = model.name.downcase - DB.query_single(<<~SQL, target_locales: target_locales, limit: limit) + DB.query_single(<<~SQL, target_locale: target_locale, limit: limit) SELECT m.id - FROM #{m}s m - LEFT JOIN discourse_translator_#{m}_locales dl ON dl.#{m}_id = m.id - LEFT JOIN LATERAL ( - SELECT array_agg(DISTINCT locale)::text[] as locales - FROM discourse_translator_#{m}_translations dt - WHERE dt.#{m}_id = m.id - ) translations ON true - WHERE NOT ( - ARRAY[:target_locales]::text[] <@ - (COALESCE( - array_cat( - ARRAY[COALESCE(dl.detected_locale, '')]::text[], - COALESCE(translations.locales, ARRAY[]::text[]) - ), - ARRAY[]::text[] - )) - ) - AND m.deleted_at IS NULL - AND m.#{content_column} != '' - AND m.user_id > 0 + FROM #{model.table_name} m + WHERE m.deleted_at IS NULL + AND m.#{content_column} != '' + AND m.user_id > 0 + AND ( + NOT EXISTS ( + SELECT 1 + FROM discourse_translator_#{m}_locales + WHERE #{m}_id = m.id + ) + OR EXISTS ( + SELECT 1 + FROM discourse_translator_#{m}_locales + WHERE #{m}_id = m.id + AND detected_locale != :target_locale + ) + ) + AND NOT EXISTS ( + SELECT 1 + FROM discourse_translator_#{m}_translations + WHERE #{m}_id = m.id + AND locale = :target_locale + ) ORDER BY m.updated_at DESC LIMIT :limit SQL @@ -65,53 +63,60 @@ def secure_backfill_lock end def translations_per_run - [ - (SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12) / - backfill_locales.size, - 1, - ].max + [(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12), 1].max end def backfill_locales - @backfill_locales ||= SiteSetting.automatic_translation_target_languages.split("|") + @backfill_locales ||= + SiteSetting.automatic_translation_target_languages.split("|").map { |l| l.gsub("_", "-") } end def translator @translator_klass ||= "DiscourseTranslator::#{SiteSetting.translator_provider}".constantize end - def translate_records(type, record_ids) + def translate_records(type, record_ids, target_locale) record_ids.each do |id| record = type.find(id) - backfill_locales.each do |target_locale| - begin - translator.translate(record, target_locale.to_sym) - rescue => e - # continue with other locales even if one fails - Rails.logger.warn( - "Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}", - ) - next - end + begin + translator.translate(record, target_locale.to_sym) + rescue => e + # continue with other locales even if one fails + Rails.logger.warn( + "Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}", + ) + next end end end def process_batch models_translated = [Post, Topic].size - translations_per_model = [translations_per_run / models_translated, 1].max - topic_ids = fetch_untranslated_model_ids(Topic, "title", translations_per_model) - translations_per_model = translations_per_run - topic_ids.size - post_ids = fetch_untranslated_model_ids(Post, "cooked", translations_per_model) - - DiscourseTranslator::VerboseLogger.log( - "Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}", - ) + avg_translations_per_model_per_language = [ + translations_per_run / models_translated / backfill_locales.size, + 1, + ].max - return if topic_ids.empty? && post_ids.empty? + records_to_translate = avg_translations_per_model_per_language + backfill_locales.each_with_index do |target_locale, i| + topic_ids = + fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale) + post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale) + + # if we end up translating fewer records than records_to_translate, + # add to the value so that the next locales can have more quota + records_to_translate = + avg_translations_per_model_per_language + + ((records_to_translate - topic_ids.size - post_ids.size) / backfill_locales.size - i) + next if topic_ids.empty? && post_ids.empty? + + DiscourseTranslator::VerboseLogger.log( + "Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}", + ) - translate_records(Topic, topic_ids) - translate_records(Post, post_ids) + translate_records(Topic, topic_ids, target_locale) + translate_records(Post, post_ids, target_locale) + end end end end diff --git a/spec/jobs/automatic_translation_backfill_spec.rb b/spec/jobs/automatic_translation_backfill_spec.rb index 3e1b5067..f91183b8 100644 --- a/spec/jobs/automatic_translation_backfill_spec.rb +++ b/spec/jobs/automatic_translation_backfill_spec.rb @@ -96,8 +96,8 @@ def expect_google_translate(text) topic.set_detected_locale("de") post.set_detected_locale("es") - expect_google_translate("hallo") expect_google_translate("hola") + expect_google_translate("hallo") described_class.new.execute @@ -108,7 +108,7 @@ def expect_google_translate(text) describe "with just one locale ['de']" do before do - SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 5 * 12 + SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 100 SiteSetting.automatic_translation_target_languages = "de" expect_google_check_language end @@ -149,64 +149,54 @@ def expect_google_translate(text) This is the scenario we are testing for: | Post ID | detected_locale | translations | selected? | Why? | |---------|-----------------|--------------|-----------|------| - | 1 | en | none | YES | source not de/es, needs both translations - | 2 | es | none | YES | source is es, but missing de translation - | 3 | null | es | YES | missing de translation - | 4 | null | de, es | NO | has both de and es translations - | 5 | de | es | NO | source is de and has es translation - | 6 | de | de | YES | both source and translation is de, missing es translation - | 7 | de | ja | YES | source is de, missing es translation + | 1 | en | none | YES | source not de + | 2 | null | es | YES | missing de translation + | 3 | null | de | NO | has de translation + | 4 | de | es | NO | source is de and has es translation + | 5 | de | de | NO | both source and translation is de, missing es translation + | 6 | null | none | YES | no detected locale nor translation =end - [posts_1, posts_2, posts_3].flatten.each do |post| - post.set_translation("es", "hola") - post.set_translation("de", "hallo") - end + [posts_1, posts_2, posts_3].flatten.each { |post| post.set_detected_locale("de") } post_1.set_detected_locale("en") - post_2.set_detected_locale("es") + post_4.set_detected_locale("de") post_5.set_detected_locale("de") - post_6.set_detected_locale("de") - post_7.set_detected_locale("de") - post_3.set_translation("es", "hola") - post_4.set_translation("de", "hallo") + post_2.set_translation("es", "hola") + post_3.set_translation("de", "hallo") post_4.set_translation("es", "hola") - post_5.set_translation("es", "hola") - post_6.set_translation("de", "hallo") - post_7.set_translation("ja", "こんにちは") + post_5.set_translation("de", "hallo") end it "returns correct post ids needing translation in descending updated_at" do - # based on the table above, we will return post_7, post_6, post_3, post_2, post_1 + # based on the table above, we will return post_6, post_2, post_1 # but we will jumble its updated_at to test if it is sorted correctly post_6.update!(updated_at: 1.day.ago) - post_3.update!(updated_at: 2.days.ago) + post_1.update!(updated_at: 2.days.ago) post_2.update!(updated_at: 3.days.ago) - post_1.update!(updated_at: 4.days.ago) - post_7.update!(updated_at: 5.days.ago) - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es]) - expect(result).to include(post_6.id, post_3.id, post_2.id, post_1.id, post_7.id) + result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") + expect(result).to include(post_6.id, post_1.id, post_2.id) end it "does not return posts that are deleted" do post_1.trash! - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es]) + result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") expect(result).not_to include(post_1.id) end it "does not return posts that are empty" do post_1.cooked = "" post_1.save!(validate: false) - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es]) + result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") expect(result).not_to include(post_1.id) end it "does not return posts by bots" do post_1.update(user: Discourse.system_user) - result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es]) + result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de") expect(result).not_to include(post_1.id) end