@@ -12,21 +12,80 @@ class Schema
POSTS_TABLE = "ai_posts_embeddings"
RAG_DOCS_TABLE = "ai_document_fragments_embeddings"

# Target names used to build "ai_<target>_embeddings" table names and the
# matching search index names. Frozen so shared state cannot be mutated.
EMBEDDING_TARGETS = %w[topics posts document_fragments].freeze
# Embedding tables, listed in the same order as EMBEDDING_TARGETS.
EMBEDDING_TABLES = [TOPICS_TABLE, POSTS_TABLE, RAG_DOCS_TABLE].freeze

# Error class for a missing embedding; it is not raised anywhere in this
# part of the file.
MissingEmbeddingError = Class.new(StandardError)
1619
17- def self . for ( target_klass )
18- vector_def = EmbeddingDefinition . find_by ( id : SiteSetting . ai_embeddings_selected_model )
19- raise "Invalid embeddings selected model" if vector_def . nil?
20-
21- case target_klass &.name
22- when "Topic"
23- new ( TOPICS_TABLE , "topic_id" , vector_def )
24- when "Post"
25- new ( POSTS_TABLE , "post_id" , vector_def )
26- when "RagDocumentFragment"
27- new ( RAG_DOCS_TABLE , "rag_document_fragment_id" , vector_def )
28- else
29- raise ArgumentError , "Invalid target type for embeddings"
20+ class << self
# Builds an instance of the enclosing class for the embeddings table that
# stores vectors for +target_klass+, using the embedding definition that
# SiteSetting.ai_embeddings_selected_model points at.
#
# @param target_klass [#name, nil] object whose +name+ is "Topic", "Post"
#   or "RagDocumentFragment"
# @return [Object] result of +new(table, id_column, vector_def)+
# @raise [RuntimeError] when the selected embedding definition is not found
# @raise [ArgumentError] when +target_klass+ is nil or not a supported target
def for(target_klass)
  vector_def = EmbeddingDefinition.find_by(id: SiteSetting.ai_embeddings_selected_model)
  # The definition is validated first, so a missing definition takes
  # precedence over an unsupported target.
  raise "Invalid embeddings selected model" if vector_def.nil?

  case target_klass&.name
  when "Topic"
    new(TOPICS_TABLE, "topic_id", vector_def)
  when "Post"
    new(POSTS_TABLE, "post_id", vector_def)
  when "RagDocumentFragment"
    new(RAG_DOCS_TABLE, "rag_document_fragment_id", vector_def)
  else
    raise ArgumentError, "Invalid target type for embeddings"
  end
end
36+
# Name of the search index for one embeddings table and one embedding
# definition. The "_1_" segment is a fixed part of the name.
#
# @param table [String] target name, e.g. "topics" (see EMBEDDING_TARGETS)
# @param def_id [Integer] id of the embedding definition
# @return [String] e.g. "ai_topics_embeddings_3_1_search_bit"
def search_index_name(table, def_id)
  # No whitespace may surround the interpolations: the result is used
  # verbatim as a SQL identifier.
  "ai_#{table}_embeddings_#{def_id}_1_search_bit"
end
40+
# Creates, if missing, one HNSW index per embedding target for the given
# embedding definition. Each index covers the binary-quantized embeddings
# cast to bit(dimensions) and is restricted by its WHERE clause to rows
# with this definition's model_id and strategy_id = 1.
#
# @param vector_def [#id, #dimensions] embedding definition
# @return [Array<String>] EMBEDDING_TARGETS (the receiver of +each+)
def prepare_search_indexes(vector_def)
  EMBEDDING_TARGETS.each do |target|
    # Identifiers cannot be bound as query parameters, so the statement is
    # built by interpolation from the target list and the definition.
    DB.exec(<<~SQL)
      CREATE INDEX IF NOT EXISTS #{search_index_name(target, vector_def.id)} ON ai_#{target}_embeddings
      USING hnsw ((binary_quantize(embeddings)::bit(#{vector_def.dimensions})) bit_hamming_ops)
      WHERE model_id = #{vector_def.id} AND strategy_id = 1;
    SQL
  end
end
48+
# Checks whether every embedding target has a search index for
# +vector_def+ whose definition casts the quantized embeddings to
# bit(<vector_def.dimensions>).
#
# @param vector_def [#id, #dimensions] embedding definition
# @return [Boolean] false when an index is missing or any index definition
#   does not contain the expected cast
def correctly_indexed?(vector_def)
  index_names = EMBEDDING_TARGETS.map { |target| search_index_name(target, vector_def.id) }

  indexdefs =
    DB.query_single(
      "SELECT indexdef FROM pg_indexes WHERE indexname IN (:names)",
      names: index_names,
    )

  # Fewer rows than names means at least one index does not exist.
  return false if indexdefs.length < index_names.length

  # Fragment expected in each index definition when it was built for the
  # current dimensions; built once rather than once per row.
  expected_cast = "(binary_quantize(embeddings))::bit(#{vector_def.dimensions})"

  indexdefs.all? { |indexdef| indexdef.include?(expected_cast) }
end
63+
# Deletes embeddings whose model_id has no row in embedding_definitions
# and drops the search indexes that were named after those definitions.
#
# Every table in EMBEDDING_TABLES is inspected for orphaned model ids, so
# rows are cleaned up even when a removed definition only ever produced
# embeddings in one of the tables.
#
# @return [nil, Object] nil when there is nothing to clean up; otherwise
#   whatever DB.exec returns for the DROP INDEX statements
def remove_orphaned_data
  removed_defs_ids =
    EMBEDDING_TABLES
      .flat_map do |table|
        DB.query_single(<<~SQL)
          SELECT DISTINCT e.model_id
          FROM #{table} e
          LEFT JOIN embedding_definitions ed ON e.model_id = ed.id
          WHERE ed.id IS NULL
        SQL
      end
      .uniq

  # Nothing orphaned: skip the DELETEs (which would get an empty IN list)
  # and the DROP batch (which would be an empty statement).
  return if removed_defs_ids.empty?

  EMBEDDING_TABLES.each do |table|
    DB.exec(
      "DELETE FROM #{table} WHERE model_id IN (:removed_defs)",
      removed_defs: removed_defs_ids,
    )
  end

  # One DROP per (target, removed definition) pair, sent as a single batch.
  drop_index_statement =
    EMBEDDING_TARGETS
      .product(removed_defs_ids)
      .map { |target, def_id| "DROP INDEX IF EXISTS #{search_index_name(target, def_id)};" }
      .join("\n")

  DB.exec(drop_index_statement)
end
3190 end
3291
0 commit comments