Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 60 additions & 55 deletions app/jobs/scheduled/automatic_translation_backfill.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,35 +18,33 @@ def execute(args = nil)
end
end

def fetch_untranslated_model_ids(
model,
content_column,
limit,
target_locales = backfill_locales
)
def fetch_untranslated_model_ids(model, content_column, limit, target_locale)
m = model.name.downcase
DB.query_single(<<~SQL, target_locales: target_locales, limit: limit)
DB.query_single(<<~SQL, target_locale: target_locale, limit: limit)
SELECT m.id
FROM #{m}s m
LEFT JOIN discourse_translator_#{m}_locales dl ON dl.#{m}_id = m.id
LEFT JOIN LATERAL (
SELECT array_agg(DISTINCT locale)::text[] as locales
FROM discourse_translator_#{m}_translations dt
WHERE dt.#{m}_id = m.id
) translations ON true
WHERE NOT (
ARRAY[:target_locales]::text[] <@
(COALESCE(
array_cat(
ARRAY[COALESCE(dl.detected_locale, '')]::text[],
COALESCE(translations.locales, ARRAY[]::text[])
),
ARRAY[]::text[]
))
)
AND m.deleted_at IS NULL
AND m.#{content_column} != ''
AND m.user_id > 0
FROM #{model.table_name} m
WHERE m.deleted_at IS NULL
AND m.#{content_column} != ''
AND m.user_id > 0
AND (
NOT EXISTS (
SELECT 1
FROM discourse_translator_#{m}_locales
WHERE #{m}_id = m.id
)
OR EXISTS (
SELECT 1
FROM discourse_translator_#{m}_locales
WHERE #{m}_id = m.id
AND detected_locale != :target_locale
)
)
AND NOT EXISTS (
SELECT 1
FROM discourse_translator_#{m}_translations
WHERE #{m}_id = m.id
AND locale = :target_locale
)
ORDER BY m.updated_at DESC
LIMIT :limit
SQL
Expand All @@ -65,53 +63,60 @@ def secure_backfill_lock
end

def translations_per_run
[
(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12) /
backfill_locales.size,
1,
].max
[(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12), 1].max
end

def backfill_locales
@backfill_locales ||= SiteSetting.automatic_translation_target_languages.split("|")
@backfill_locales ||=
SiteSetting.automatic_translation_target_languages.split("|").map { |l| l.gsub("_", "-") }
end

def translator
@translator_klass ||= "DiscourseTranslator::#{SiteSetting.translator_provider}".constantize
end

def translate_records(type, record_ids)
def translate_records(type, record_ids, target_locale)
record_ids.each do |id|
record = type.find(id)
backfill_locales.each do |target_locale|
begin
translator.translate(record, target_locale.to_sym)
rescue => e
# continue with other locales even if one fails
Rails.logger.warn(
"Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
)
next
end
begin
translator.translate(record, target_locale.to_sym)
rescue => e
# continue with other locales even if one fails
Rails.logger.warn(
"Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
)
next
end
end
end

def process_batch
models_translated = [Post, Topic].size
translations_per_model = [translations_per_run / models_translated, 1].max
topic_ids = fetch_untranslated_model_ids(Topic, "title", translations_per_model)
translations_per_model = translations_per_run - topic_ids.size
post_ids = fetch_untranslated_model_ids(Post, "cooked", translations_per_model)

DiscourseTranslator::VerboseLogger.log(
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",
)
avg_translations_per_model_per_language = [
translations_per_run / models_translated / backfill_locales.size,
1,
].max

return if topic_ids.empty? && post_ids.empty?
records_to_translate = avg_translations_per_model_per_language
backfill_locales.each_with_index do |target_locale, i|
topic_ids =
fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale)
post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale)

# if we end up translating fewer records than records_to_translate,
# add to the value so that the next locales can have more quota
records_to_translate =
avg_translations_per_model_per_language +
((records_to_translate - topic_ids.size - post_ids.size) / backfill_locales.size - i)
next if topic_ids.empty? && post_ids.empty?

DiscourseTranslator::VerboseLogger.log(
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",
)

translate_records(Topic, topic_ids)
translate_records(Post, post_ids)
translate_records(Topic, topic_ids, target_locale)
translate_records(Post, post_ids, target_locale)
end
end
end
end
50 changes: 20 additions & 30 deletions spec/jobs/automatic_translation_backfill_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ def expect_google_translate(text)
topic.set_detected_locale("de")
post.set_detected_locale("es")

expect_google_translate("hallo")
expect_google_translate("hola")
expect_google_translate("hallo")

described_class.new.execute

Expand All @@ -108,7 +108,7 @@ def expect_google_translate(text)

describe "with just one locale ['de']" do
before do
SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 5 * 12
SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 100
SiteSetting.automatic_translation_target_languages = "de"
expect_google_check_language
end
Expand Down Expand Up @@ -149,64 +149,54 @@ def expect_google_translate(text)
This is the scenario we are testing for:
| Post ID | detected_locale | translations | selected? | Why? |
|---------|-----------------|--------------|-----------|------|
| 1 | en | none | YES | source not de/es, needs both translations
| 2 | es | none | YES | source is es, but missing de translation
| 3 | null | es | YES | missing de translation
| 4 | null | de, es | NO | has both de and es translations
| 5 | de | es | NO | source is de and has es translation
| 6 | de | de | YES | both source and translation is de, missing es translation
| 7 | de | ja | YES | source is de, missing es translation
| 1 | en | none | YES | source not de
| 2 | null | es | YES | missing de translation
| 3 | null | de | NO | has de translation
| 4 | de | es | NO | source is de and has es translation
| 5 | de | de | NO | both source and translation is de, missing es translation
| 6 | null | none | YES | no detected locale nor translation
=end

[posts_1, posts_2, posts_3].flatten.each do |post|
post.set_translation("es", "hola")
post.set_translation("de", "hallo")
end
[posts_1, posts_2, posts_3].flatten.each { |post| post.set_detected_locale("de") }

post_1.set_detected_locale("en")
post_2.set_detected_locale("es")
post_4.set_detected_locale("de")
post_5.set_detected_locale("de")
post_6.set_detected_locale("de")
post_7.set_detected_locale("de")

post_3.set_translation("es", "hola")
post_4.set_translation("de", "hallo")
post_2.set_translation("es", "hola")
post_3.set_translation("de", "hallo")
post_4.set_translation("es", "hola")
post_5.set_translation("es", "hola")
post_6.set_translation("de", "hallo")
post_7.set_translation("ja", "こんにちは")
post_5.set_translation("de", "hallo")
end

it "returns correct post ids needing translation in descending updated_at" do
# based on the table above, we will return post_7, post_6, post_3, post_2, post_1
# based on the table above, we will return post_6, post_2, post_1
# but we will jumble its updated_at to test if it is sorted correctly
post_6.update!(updated_at: 1.day.ago)
post_3.update!(updated_at: 2.days.ago)
post_1.update!(updated_at: 2.days.ago)
post_2.update!(updated_at: 3.days.ago)
post_1.update!(updated_at: 4.days.ago)
post_7.update!(updated_at: 5.days.ago)

result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
expect(result).to include(post_6.id, post_3.id, post_2.id, post_1.id, post_7.id)
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
expect(result).to include(post_6.id, post_1.id, post_2.id)
end

it "does not return posts that are deleted" do
post_1.trash!
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
expect(result).not_to include(post_1.id)
end

it "does not return posts that are empty" do
post_1.cooked = ""
post_1.save!(validate: false)
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
expect(result).not_to include(post_1.id)
end

it "does not return posts by bots" do
post_1.update(user: Discourse.system_user)

result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")

expect(result).not_to include(post_1.id)
end
Expand Down
Loading