Skip to content

Commit c953291

Browse files
authored
DEV: Backfill per locale per model instead of just per model (#243)
Currently backfilling very inefficiently, as we were querying by a single Topic or Post as long as it does not have any of the target languages (based on an array rather than a single locale). This commit splits the backfill job into per locale rather than grouping all the locales together and finding which topics have missing translations. e.g. from the log item below, you can sort of tell that 18 topics are "shared" between all the languages, even if the language already has the translation. ``` DiscourseTranslator: Translating 18 topics and 19 posts to en, zh_CN, es, fr, de, pt_BR, it, ar, he ``` It does potentially end up loading a topic multiple times, but the alternative is a much slower backfill if new languages get added to the target languages.
1 parent d371bb3 commit c953291

File tree

2 files changed

+80
-85
lines changed

2 files changed

+80
-85
lines changed

app/jobs/scheduled/automatic_translation_backfill.rb

Lines changed: 60 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -18,35 +18,33 @@ def execute(args = nil)
1818
end
1919
end
2020

21-
def fetch_untranslated_model_ids(
22-
model,
23-
content_column,
24-
limit,
25-
target_locales = backfill_locales
26-
)
21+
def fetch_untranslated_model_ids(model, content_column, limit, target_locale)
2722
m = model.name.downcase
28-
DB.query_single(<<~SQL, target_locales: target_locales, limit: limit)
23+
DB.query_single(<<~SQL, target_locale: target_locale, limit: limit)
2924
SELECT m.id
30-
FROM #{m}s m
31-
LEFT JOIN discourse_translator_#{m}_locales dl ON dl.#{m}_id = m.id
32-
LEFT JOIN LATERAL (
33-
SELECT array_agg(DISTINCT locale)::text[] as locales
34-
FROM discourse_translator_#{m}_translations dt
35-
WHERE dt.#{m}_id = m.id
36-
) translations ON true
37-
WHERE NOT (
38-
ARRAY[:target_locales]::text[] <@
39-
(COALESCE(
40-
array_cat(
41-
ARRAY[COALESCE(dl.detected_locale, '')]::text[],
42-
COALESCE(translations.locales, ARRAY[]::text[])
43-
),
44-
ARRAY[]::text[]
45-
))
46-
)
47-
AND m.deleted_at IS NULL
48-
AND m.#{content_column} != ''
49-
AND m.user_id > 0
25+
FROM #{model.table_name} m
26+
WHERE m.deleted_at IS NULL
27+
AND m.#{content_column} != ''
28+
AND m.user_id > 0
29+
AND (
30+
NOT EXISTS (
31+
SELECT 1
32+
FROM discourse_translator_#{m}_locales
33+
WHERE #{m}_id = m.id
34+
)
35+
OR EXISTS (
36+
SELECT 1
37+
FROM discourse_translator_#{m}_locales
38+
WHERE #{m}_id = m.id
39+
AND detected_locale != :target_locale
40+
)
41+
)
42+
AND NOT EXISTS (
43+
SELECT 1
44+
FROM discourse_translator_#{m}_translations
45+
WHERE #{m}_id = m.id
46+
AND locale = :target_locale
47+
)
5048
ORDER BY m.updated_at DESC
5149
LIMIT :limit
5250
SQL
@@ -65,53 +63,60 @@ def secure_backfill_lock
6563
end
6664

6765
def translations_per_run
68-
[
69-
(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12) /
70-
backfill_locales.size,
71-
1,
72-
].max
66+
[(SiteSetting.automatic_translation_backfill_maximum_translations_per_hour / 12), 1].max
7367
end
7468

7569
def backfill_locales
76-
@backfill_locales ||= SiteSetting.automatic_translation_target_languages.split("|")
70+
@backfill_locales ||=
71+
SiteSetting.automatic_translation_target_languages.split("|").map { |l| l.gsub("_", "-") }
7772
end
7873

7974
def translator
8075
@translator_klass ||= "DiscourseTranslator::#{SiteSetting.translator_provider}".constantize
8176
end
8277

83-
def translate_records(type, record_ids)
78+
def translate_records(type, record_ids, target_locale)
8479
record_ids.each do |id|
8580
record = type.find(id)
86-
backfill_locales.each do |target_locale|
87-
begin
88-
translator.translate(record, target_locale.to_sym)
89-
rescue => e
90-
# continue with other locales even if one fails
91-
Rails.logger.warn(
92-
"Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
93-
)
94-
next
95-
end
81+
begin
82+
translator.translate(record, target_locale.to_sym)
83+
rescue => e
84+
# continue with other locales even if one fails
85+
Rails.logger.warn(
86+
"Failed to machine-translate #{type.name}##{id} to #{target_locale}: #{e.message}\n#{e.backtrace.join("\n")}",
87+
)
88+
next
9689
end
9790
end
9891
end
9992

10093
def process_batch
10194
models_translated = [Post, Topic].size
102-
translations_per_model = [translations_per_run / models_translated, 1].max
103-
topic_ids = fetch_untranslated_model_ids(Topic, "title", translations_per_model)
104-
translations_per_model = translations_per_run - topic_ids.size
105-
post_ids = fetch_untranslated_model_ids(Post, "cooked", translations_per_model)
106-
107-
DiscourseTranslator::VerboseLogger.log(
108-
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",
109-
)
95+
avg_translations_per_model_per_language = [
96+
translations_per_run / models_translated / backfill_locales.size,
97+
1,
98+
].max
11099

111-
return if topic_ids.empty? && post_ids.empty?
100+
records_to_translate = avg_translations_per_model_per_language
101+
backfill_locales.each_with_index do |target_locale, i|
102+
topic_ids =
103+
fetch_untranslated_model_ids(Topic, "title", records_to_translate, target_locale)
104+
post_ids = fetch_untranslated_model_ids(Post, "cooked", records_to_translate, target_locale)
105+
106+
# if we end up translating fewer records than records_to_translate,
107+
# add to the value so that the next locales can have more quota
108+
records_to_translate =
109+
avg_translations_per_model_per_language +
110+
((records_to_translate - topic_ids.size - post_ids.size) / backfill_locales.size - i)
111+
next if topic_ids.empty? && post_ids.empty?
112+
113+
DiscourseTranslator::VerboseLogger.log(
114+
"Translating #{topic_ids.size} topics and #{post_ids.size} posts to #{backfill_locales.join(", ")}",
115+
)
112116

113-
translate_records(Topic, topic_ids)
114-
translate_records(Post, post_ids)
117+
translate_records(Topic, topic_ids, target_locale)
118+
translate_records(Post, post_ids, target_locale)
119+
end
115120
end
116121
end
117122
end

spec/jobs/automatic_translation_backfill_spec.rb

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ def expect_google_translate(text)
9696
topic.set_detected_locale("de")
9797
post.set_detected_locale("es")
9898

99-
expect_google_translate("hallo")
10099
expect_google_translate("hola")
100+
expect_google_translate("hallo")
101101

102102
described_class.new.execute
103103

@@ -108,7 +108,7 @@ def expect_google_translate(text)
108108

109109
describe "with just one locale ['de']" do
110110
before do
111-
SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 5 * 12
111+
SiteSetting.automatic_translation_backfill_maximum_translations_per_hour = 100
112112
SiteSetting.automatic_translation_target_languages = "de"
113113
expect_google_check_language
114114
end
@@ -149,64 +149,54 @@ def expect_google_translate(text)
149149
This is the scenario we are testing for:
150150
| Post ID | detected_locale | translations | selected? | Why? |
151151
|---------|-----------------|--------------|-----------|------|
152-
| 1 | en | none | YES | source not de/es, needs both translations
153-
| 2 | es | none | YES | source is es, but missing de translation
154-
| 3 | null | es | YES | missing de translation
155-
| 4 | null | de, es | NO | has both de and es translations
156-
| 5 | de | es | NO | source is de and has es translation
157-
| 6 | de | de | YES | both source and translation is de, missing es translation
158-
| 7 | de | ja | YES | source is de, missing es translation
152+
| 1 | en | none | YES | source not de
153+
| 2 | null | es | YES | missing de translation
154+
| 3 | null | de | NO | has de translation
155+
| 4 | de | es | NO | source is de and has es translation
156+
| 5 | de | de | NO | both source and translation is de, missing es translation
157+
| 6 | null | none | YES | no detected locale nor translation
159158
=end
160159

161-
[posts_1, posts_2, posts_3].flatten.each do |post|
162-
post.set_translation("es", "hola")
163-
post.set_translation("de", "hallo")
164-
end
160+
[posts_1, posts_2, posts_3].flatten.each { |post| post.set_detected_locale("de") }
165161

166162
post_1.set_detected_locale("en")
167-
post_2.set_detected_locale("es")
163+
post_4.set_detected_locale("de")
168164
post_5.set_detected_locale("de")
169-
post_6.set_detected_locale("de")
170-
post_7.set_detected_locale("de")
171165

172-
post_3.set_translation("es", "hola")
173-
post_4.set_translation("de", "hallo")
166+
post_2.set_translation("es", "hola")
167+
post_3.set_translation("de", "hallo")
174168
post_4.set_translation("es", "hola")
175-
post_5.set_translation("es", "hola")
176-
post_6.set_translation("de", "hallo")
177-
post_7.set_translation("ja", "こんにちは")
169+
post_5.set_translation("de", "hallo")
178170
end
179171

180172
it "returns correct post ids needing translation in descending updated_at" do
181-
# based on the table above, we will return post_7, post_6, post_3, post_2, post_1
173+
# based on the table above, we will return post_6, post_2, post_1
182174
# but we will jumble its updated_at to test if it is sorted correctly
183175
post_6.update!(updated_at: 1.day.ago)
184-
post_3.update!(updated_at: 2.days.ago)
176+
post_1.update!(updated_at: 2.days.ago)
185177
post_2.update!(updated_at: 3.days.ago)
186-
post_1.update!(updated_at: 4.days.ago)
187-
post_7.update!(updated_at: 5.days.ago)
188178

189-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
190-
expect(result).to include(post_6.id, post_3.id, post_2.id, post_1.id, post_7.id)
179+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
180+
expect(result).to include(post_6.id, post_1.id, post_2.id)
191181
end
192182

193183
it "does not return posts that are deleted" do
194184
post_1.trash!
195-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
185+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
196186
expect(result).not_to include(post_1.id)
197187
end
198188

199189
it "does not return posts that are empty" do
200190
post_1.cooked = ""
201191
post_1.save!(validate: false)
202-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
192+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
203193
expect(result).not_to include(post_1.id)
204194
end
205195

206196
it "does not return posts by bots" do
207197
post_1.update(user: Discourse.system_user)
208198

209-
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, %w[de es])
199+
result = described_class.new.fetch_untranslated_model_ids(Post, "cooked", 50, "de")
210200

211201
expect(result).not_to include(post_1.id)
212202
end

0 commit comments

Comments
 (0)