diff --git a/app/jobs/scheduled/embeddings_backfill.rb b/app/jobs/scheduled/embeddings_backfill.rb
index db0828f2f..c7af4e9fc 100644
--- a/app/jobs/scheduled/embeddings_backfill.rb
+++ b/app/jobs/scheduled/embeddings_backfill.rb
@@ -70,6 +70,7 @@ def execute(args)
         Post
           .joins("LEFT JOIN #{table_name} ON #{table_name}.post_id = posts.id")
           .where(deleted_at: nil)
+          .where(post_type: Post.types[:regular])
           .limit(limit - rebaked)
 
       # First, we'll try to backfill embeddings for posts that have none
diff --git a/lib/embeddings/vector_representations/base.rb b/lib/embeddings/vector_representations/base.rb
index e1f3ff497..7055ea231 100644
--- a/lib/embeddings/vector_representations/base.rb
+++ b/lib/embeddings/vector_representations/base.rb
@@ -61,18 +61,21 @@ def gen_bulk_reprensentations(relation)
 
           embedding_gen = inference_client
           promised_embeddings =
-            relation.map do |record|
-              materials = { target: record, text: prepare_text(record) }
-
-              Concurrent::Promises
-                .fulfilled_future(materials, pool)
-                .then_on(pool) do |w_prepared_text|
-                  w_prepared_text.merge(
-                    embedding: embedding_gen.perform!(w_prepared_text[:text]),
-                    digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
-                  )
-                end
-            end
+            relation
+              .map do |record|
+                prepared_text = prepare_text(record)
+                next if prepared_text.blank?
+
+                Concurrent::Promises
+                  .fulfilled_future({ target: record, text: prepared_text }, pool)
+                  .then_on(pool) do |w_prepared_text|
+                    w_prepared_text.merge(
+                      embedding: embedding_gen.perform!(w_prepared_text[:text]),
+                      digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
+                    )
+                  end
+              end
+              .compact
 
           Concurrent::Promises
             .zip(*promised_embeddings)
diff --git a/lib/embeddings/vector_representations/multilingual_e5_large.rb b/lib/embeddings/vector_representations/multilingual_e5_large.rb
index 605ec8b55..fe611ec97 100644
--- a/lib/embeddings/vector_representations/multilingual_e5_large.rb
+++ b/lib/embeddings/vector_representations/multilingual_e5_large.rb
@@ -81,11 +81,13 @@ def inference_client
        end
 
        def prepare_text(record)
-          if inference_client.class.name.include?("DiscourseClassifier")
-            return "query: #{super(record)}"
+          prepared_text = super(record)
+
+          if prepared_text.present? && inference_client.class.name.include?("DiscourseClassifier")
+            return "query: #{prepared_text}"
          end
 
-          super(record)
+          prepared_text
        end
      end
    end
diff --git a/spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb b/spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb
index fce9f6123..075e6930b 100644
--- a/spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb
+++ b/spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb
@@ -79,6 +79,10 @@
       expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
       expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
     end
+
+    it "does nothing if passed record has no content" do
+      expect { vector_rep.gen_bulk_reprensentations([Topic.new]) }.not_to raise_error
+    end
   end
 
   describe "#asymmetric_topics_similarity_search" do