Skip to content

Commit 227dc50

Browse files
committed
DEV: Backfill per locale per model instead of just per model
1 parent 9aba526 commit 227dc50

File tree

2 files changed

+80
-85
lines changed

2 files changed

+80
-85
lines changed

app/jobs/scheduled/automatic_translation_backfill.rb

Lines changed: 60 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -18,35 +18,33 @@ def execute(args = nil)
1818
end
1919
end
2020

21-
def fetch_untranslated_model_ids(
22-
model,
23-
content_column,
24-
limit,
25-
target_locales = backfill_locales
26-
)
21+
def fetch_untranslated_model_ids(model, content_column, limit, target_locale)
2722
m = model.name.downcase
28-
DB.query_single(<<~SQL, target_locales: target_locales, limit: limit)
23+
DB.query_single(<<~SQL, target_locale: target_locale, limit: limit)
2924
SELECT m.id
30-
FROM #{m}s m
31-
LEFT JOIN discourse_translator_#{m}_locales dl ON dl.#{m}_id = m.id
32-
LEFT JOIN LATERAL (
33-
SELECT array_agg(DISTINCT locale)::text[] as locales
34-
FROM discourse_translator_#{m}_translations dt
35-
WHERE dt.#{m}_id = m.id
36-
) translations ON true
37-
WHERE NOT (
38-
ARRAY[:target_locales]::text[] <@
39-
(COALESCE(
40-
array_cat(
41-
ARRAY[COALESCE(dl.detected_locale, '')]::text[],
42-
COALESCE(translations.locales, ARRAY[]::text[])
43-
),
44-
ARRAY[]::text[]
45-
))
46-
)
47-
AND m.deleted_at IS NULL
48-
AND m.#{content_column} != ''
49-
AND m.user_id > 0
25+
FROM #{model.table_name} m
26+
WHERE m.deleted_at IS NULL
27+
AND m.#{content_column} != ''
28+
AND m.user_id > 0
29+
AND (
30+
NOT EXISTS (
31+
SELECT 1
32+
FROM discourse_translator_#{m}_locales
33+
WHERE #{m}_id = m.id
34+
)
35+
OR EXISTS (
36+
SELECT 1
37+
FROM discourse_translator_#{m}_locales
38+
WHERE #{m}_id = m.id
39+
AND detected_locale != :target_locale
40+
)
41+
)
42+
AND NOT EXISTS (
43+
SELECT 1
44+
FROM discourse_translator_#{m}_translations
45+
WHERE #{m}_id = m.id
46+
AND locale = :target_locale
47+
)
5048
ORDER BY m.updated_at DESC
5149
LIMIT :limit
5250
SQL
@@ -65,53 +63,60 @@ def secure_backfill_lock
6563
end
6664

6765
def translations_per_run
68-
[
69-
(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12) /
70-
backfill_locales.size,
71-
1,
72-
].max
66+
[(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12), 1].max
7367
end
7468

7569
def backfill_locales
76-
@backfill_locales ||= SiteSetting.automatic_translation_target_languages.split("|")
70+
@backfill_locales ||=
71+
SiteSetting.automatic_translation_target_languages.split("|").map { |l| l.gsub("_", "-") }
7772
end
7873

7974
def translator
8075
@translator_klass ||= "DiscourseTranslator::#{SiteSetting.translator_provider}".constantize
8176
end
8277

83-
def translate_records(type, record_ids)
78+
def translate_records(type, record_ids, target_locale)
8479
record_ids.each do |id|
8580
record = type.find(id)
86-
backfill_locales.each do |target_locale|
87-
begin
88-
translator.translate(record, target_locale.to_sym)
89-
rescue => e
90-
# continue with other locales even if one fails
91-
Rails.logger.warn(
92-
"Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
93-
)
94-
next
95-
end
81+
begin
82+
translator.translate(record, target_locale.to_sym)
83+
rescue => e
84+
# continue with other locales even if one fails
85+
Rails.logger.warn(
86+
"Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
87+
)
88+
next
9689
end
9790
end
9891
end
9992

10093
def process_batch
10194
models_translated = [Post, Topic].size
102-
translations_per_model = [translations_per_run / models_translated, 1].max
103-
topic_ids = fetch_untranslated_model_ids(Topic, "title", translations_per_model)
104-
translations_per_model = translations_per_run - topic_ids.size
105-
post_ids = fetch_untranslated_model_ids(Post, "cooked", translations_per_model)
106-
107-
DiscourseTranslator::VerboseLogger.log(
108-
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",
109-
)
95+
avg_translations_per_model_per_language = [
96+
translations_per_run / models_translated / backfill_locales.size,
97+
1,
98+
].max
11099

111-
return if topic_ids.empty? && post_ids.empty?
100+
records_to_translate = avg_translations_per_model_per_language
101+
backfill_locales.each_with_index do |target_locale, i|
102+
topic_ids =
103+
fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale)
104+
post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale)
105+
106+
# if we end up translating fewer records than records_to_translate,
107+
# add to the value so that the next locales can have more quota
108+
records_to_translate =
109+
avg_translations_per_model_per_language +
110+
((records_to_translate - topic_ids.size - post_ids.size) / backfill_locales.size - i)
111+
next if topic_ids.empty? && post_ids.empty?
112+
113+
DiscourseTranslator::VerboseLogger.log(
114+
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",
115+
)
112116

113-
translate_records(Topic, topic_ids)
114-
translate_records(Post, post_ids)
117+
translate_records(Topic, topic_ids, target_locale)
118+
translate_records(Post, post_ids, target_locale)
119+
end
115120
end
116121
end
117122
end

spec/jobs/automatic_translation_backfill_spec.rb

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ def expect_google_translate(text)
9696
topic.set_detected_locale("de")
9797
post.set_detected_locale("es")
9898

99-
expect_google_translate("hallo")
10099
expect_google_translate("hola")
100+
expect_google_translate("hallo")
101101

102102
described_class.new.execute
103103

@@ -108,7 +108,7 @@ def expect_google_translate(text)
108108

109109
describe "with just one locale ['de']" do
110110
before do
111-
SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 5 * 12
111+
SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 100
112112
SiteSetting.automatic_translation_target_languages = "de"
113113
expect_google_check_language
114114
end
@@ -149,64 +149,54 @@ def expect_google_translate(text)
149149
This is the scenario we are testing for:
150150
| Post ID | detected_locale | translations | selected? | Why? |
151151
|---------|-----------------|--------------|-----------|------|
152-
| 1 | en | none | YES | source not de/es, needs both translations
153-
| 2 | es | none | YES | source is es, but missing de translation
154-
| 3 | null | es | YES | missing de translation
155-
| 4 | null | de, es | NO | has both de and es translations
156-
| 5 | de | es | NO | source is de and has es translation
157-
| 6 | de | de | YES | both source and translation is de, missing es translation
158-
| 7 | de | ja | YES | source is de, missing es translation
152+
| 1 | en | none | YES | source not de
153+
| 2 | null | es | YES | missing de translation
154+
| 3 | null | de | NO | has de translation
155+
| 4 | de | es | NO | source is de and has es translation
156+
| 5 | de | de | NO | both source and translation is de, missing es translation
157+
| 6 | null | none | YES | no detected locale nor translation
159158
=end
160159

161-
[posts_1, posts_2, posts_3].flatten.each do |post|
162-
post.set_translation("es", "hola")
163-
post.set_translation("de", "hallo")
164-
end
160+
[posts_1, posts_2, posts_3].flatten.each { |post| post.set_detected_locale("de") }
165161

166162
post_1.set_detected_locale("en")
167-
post_2.set_detected_locale("es")
163+
post_4.set_detected_locale("de")
168164
post_5.set_detected_locale("de")
169-
post_6.set_detected_locale("de")
170-
post_7.set_detected_locale("de")
171165

172-
post_3.set_translation("es", "hola")
173-
post_4.set_translation("de", "hallo")
166+
post_2.set_translation("es", "hola")
167+
post_3.set_translation("de", "hallo")
174168
post_4.set_translation("es", "hola")
175-
post_5.set_translation("es", "hola")
176-
post_6.set_translation("de", "hallo")
177-
post_7.set_translation("ja", "こんにちは")
169+
post_5.set_translation("de", "hallo")
178170
end
179171

180172
it "returns correct post ids needing translation in descending updated_at" do
181-
# based on the table above, we will return post_7, post_6, post_3, post_2, post_1
173+
# based on the table above, we will return post_6, post_2, post_1
182174
# but we will jumble its updated_at to test if it is sorted correctly
183175
post_6.update!(updated_at: 1.day.ago)
184-
post_3.update!(updated_at: 2.days.ago)
176+
post_1.update!(updated_at: 2.days.ago)
185177
post_2.update!(updated_at: 3.days.ago)
186-
post_1.update!(updated_at: 4.days.ago)
187-
post_7.update!(updated_at: 5.days.ago)
188178

189-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
190-
expect(result).to include(post_6.id, post_3.id, post_2.id, post_1.id, post_7.id)
179+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
180+
expect(result).to include(post_6.id, post_1.id, post_2.id)
191181
end
192182

193183
it "does not return posts that are deleted" do
194184
post_1.trash!
195-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
185+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
196186
expect(result).not_to include(post_1.id)
197187
end
198188

199189
it "does not return posts that are empty" do
200190
post_1.cooked = ""
201191
post_1.save!(validate: false)
202-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
192+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
203193
expect(result).not_to include(post_1.id)
204194
end
205195

206196
it "does not return posts by bots" do
207197
post_1.update(user: Discourse.system_user)
208198

209-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
199+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
210200

211201
expect(result).not_to include(post_1.id)
212202
end

0 commit comments

Comments
 (0)