Skip to content
This repository was archived by the owner on Jul 22, 2025. It is now read-only.

Commit 0eeda36

Browse files
committed
FIX: Add a digest check to avoid repeatedly generating embeddings (bulk)
1 parent d6beac4 commit 0eeda36

File tree

2 files changed

+61
-29
lines changed

2 files changed

+61
-29
lines changed

lib/embeddings/vector_representations/base.rb

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,16 @@ def gen_bulk_reprensentations(relation)
6666
prepared_text = prepare_text(record)
6767
next if prepared_text.blank?
6868

69+
new_digest = OpenSSL::Digest::SHA1.hexdigest(prepared_text)
70+
next if find_digest_of(record) == new_digest
71+
6972
Concurrent::Promises
70-
.fulfilled_future({ target: record, text: prepared_text }, pool)
73+
.fulfilled_future(
74+
{ target: record, text: prepared_text, digest: new_digest },
75+
pool,
76+
)
7177
.then_on(pool) do |w_prepared_text|
72-
w_prepared_text.merge(
73-
embedding: embedding_gen.perform!(w_prepared_text[:text]),
74-
digest: OpenSSL::Digest::SHA1.hexdigest(w_prepared_text[:text]),
75-
)
78+
w_prepared_text.merge(embedding: embedding_gen.perform!(w_prepared_text[:text]))
7679
end
7780
end
7881
.compact
@@ -90,31 +93,8 @@ def generate_representation_from(target, persist: true)
9093
text = prepare_text(target)
9194
return if text.blank?
9295

93-
target_column =
94-
case target
95-
when Topic
96-
"topic_id"
97-
when Post
98-
"post_id"
99-
when RagDocumentFragment
100-
"rag_document_fragment_id"
101-
else
102-
raise ArgumentError, "Invalid target type"
103-
end
104-
10596
new_digest = OpenSSL::Digest::SHA1.hexdigest(text)
106-
current_digest = DB.query_single(<<~SQL, target_id: target.id).first
107-
SELECT
108-
digest
109-
FROM
110-
#{table_name(target)}
111-
WHERE
112-
model_id = #{id} AND
113-
strategy_id = #{@strategy.id} AND
114-
#{target_column} = :target_id
115-
LIMIT 1
116-
SQL
117-
return if current_digest == new_digest
97+
return if find_digest_of(target) == new_digest
11898

11999
vector = vector_from(text)
120100

@@ -412,6 +392,32 @@ def asymmetric_query_prefix
412392

413393
protected
414394

395+
def find_digest_of(target)
396+
target_column =
397+
case target
398+
when Topic
399+
"topic_id"
400+
when Post
401+
"post_id"
402+
when RagDocumentFragment
403+
"rag_document_fragment_id"
404+
else
405+
raise ArgumentError, "Invalid target type"
406+
end
407+
408+
DB.query_single(<<~SQL, target_id: target.id).first
409+
SELECT
410+
digest
411+
FROM
412+
#{table_name(target)}
413+
WHERE
414+
model_id = #{id} AND
415+
strategy_id = #{@strategy.id} AND
416+
#{target_column} = :target_id
417+
LIMIT 1
418+
SQL
419+
end
420+
415421
def save_to_db(target, vector, digest)
416422
if target.is_a?(Topic)
417423
DB.exec(

spec/lib/modules/embeddings/vector_representations/vector_rep_shared_examples.rb

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,32 @@
8383
it "does nothing if passed record has no content" do
8484
expect { vector_rep.gen_bulk_reprensentations([Topic.new]) }.not_to raise_error
8585
end
86+
87+
it "doesn't ask for a new embedding if digest is the same" do
88+
text =
89+
truncation.prepare_text_from(
90+
topic,
91+
vector_rep.tokenizer,
92+
vector_rep.max_sequence_length - 2,
93+
)
94+
stub_vector_mapping(text, expected_embedding_1)
95+
96+
original_vector_gen = Time.zone.parse("2021-06-04 10:00")
97+
98+
freeze_time(original_vector_gen) do
99+
vector_rep.gen_bulk_reprensentations(Topic.where(id: [topic.id]))
100+
end
101+
# check vector exists
102+
expect(vector_rep.topic_id_from_representation(expected_embedding_1)).to eq(topic.id)
103+
104+
vector_rep.gen_bulk_reprensentations(Topic.where(id: [topic.id]))
105+
last_update =
106+
DB.query_single(
107+
"SELECT updated_at FROM #{vector_rep.topic_table_name} WHERE topic_id = #{topic.id} LIMIT 1",
108+
).first
109+
110+
expect(last_update).to eq(original_vector_gen)
111+
end
86112
end
87113

88114
describe "#asymmetric_topics_similarity_search" do

0 commit comments

Comments
 (0)