Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit f8edcec

Browse files
committed
FIX: Skip records without content to classify
1 parent ddf2bf7 commit f8edcec

File tree

4 files changed

+25
-15
lines changed

4 files changed

+25
-15
lines changed

app/jobs/scheduled/embeddings_backfill.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ def execute(args)
7070
Post
7171
.joins("LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id")
7272
.where(deleted_at: nil)
73+
.where(post_type: Post.types[:regular])
7374
.limit(limit - rebaked)
7475

7576
# First, we'll try to backfill embeddings for posts that have none

lib/embeddings/vector_representations/base.rb

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,21 @@ def gen_bulk_reprensentations(relation)
6161

6262
embedding_gen = inference_client
6363
promised_embeddings =
64-
relation.map do |record|
65-
materials = { target: record, text: prepare_text(record) }
66-
67-
Concurrent::Promises
68-
.fulfilled_future(materials, pool)
69-
.then_on(pool) do |w_prepared_text|
70-
w_prepared_text.merge(
71-
embedding: embedding_gen.perform!(w_prepared_text[:text]),
72-
digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
73-
)
74-
end
75-
end
64+
relation
65+
.map do |record|
66+
prepared_text = prepare_text(record)
67+
next if prepared_text.blank?
68+
69+
Concurrent::Promises
70+
.fulfilled_future({ target: record, text: prepared_text }, pool)
71+
.then_on(pool) do |w_prepared_text|
72+
w_prepared_text.merge(
73+
embedding: embedding_gen.perform!(w_prepared_text[:text]),
74+
digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
75+
)
76+
end
77+
end
78+
.compact
7679

7780
Concurrent::Promises
7881
.zip(*promised_embeddings)

lib/embeddings/vector_representations/multilingual_e5_large.rb

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,11 +81,13 @@ def inference_client
8181
end
8282

8383
def prepare_text(record)
84-
if inference_client.class.name.include?("DiscourseClassifier")
85-
return "query: #{super(record)}"
84+
prepared_text = super(record)
85+
86+
if prepared_text.present? && inference_client.class.name.include?("DiscourseClassifier")
87+
return "query: #{prepared_text}"
8688
end
8789

88-
super(record)
90+
prepared_text
8991
end
9092
end
9193
end

spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@
7979
expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
8080
expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
8181
end
82+
83+
it "does nothing if passed record has no content" do
84+
expect { vector_rep.gen_bulk_reprensentations([Topic.new]) }.not_to raise_error
85+
end
8286
end
8387

8488
describe "#asymmetric_topics_similarity_search" do

0 commit comments

Comments
 (0)