@@ -4,47 +4,111 @@ rank-profile base-features {
44 query(float_embedding) tensor<float>(x[768])
55 }
66
# Fixed-length chunking should leave no positional gap between elements,
# so lexical search across chunks (e.g. `chunks contains "hello world"`)
# behaves as if the chunks were one contiguous blob of text
# (e.g. it would match ["and she said hello", "world!"]).
rank chunks {
    element-gap: 0
}
13+
# Builds a tensor with a single mapped dimension (the chunk ID) holding the
# BM25 score of each matched chunk. Returns something like:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": 0.5112776,
#     "5": 1.1021805
#   }
# }
function chunk_text_scores() {
    expression: elementwise(bm25(chunks), chunk, float)
}
1326
# Expands every int8 of each chunk embedding into 8 floats (one per bit),
# turning tensor<int8>(chunk{}, x[96]) into tensor<float>(chunk{}, x[768]).
# This prepares the bit embeddings for a dot product with the query
# embedding. Returns something like:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": [0.0, 1.0, 0.0, 0.0, 1.0, ... up to 768 floats],
#     "5": [1.0, 0.0, 1.0, 0.0, 1.0, ... up to 768 floats]
#   }
# }
function chunk_emb_vecs() {
    expression: unpack_bits(attribute(chunk_embeddings))
}
1740
# Dot product of the query embedding with every chunk embedding, reduced
# (summed) over the dense dimension x. Yields one score per chunk ID, e.g.:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": 8.7,  # dot product of the query embedding with chunk 2's embedding
#     "5": 1.2
#   }
# }
function chunk_dot_prod() {
    expression: reduce(query(float_embedding) * chunk_emb_vecs(), sum, x)
}
2153
54+
# Euclidean norm (magnitude) of every vector in tensor v, taken along
# dimension x. Used to normalize the chunk dot products — which grow with
# the number of embedding dimensions — into cosine similarities.
function vector_norms(v) {
    expression: sqrt(sum(pow(v, 2), x))
}
62+
# Cosine similarity between the query embedding and each chunk embedding:
# the dot product divided by the product of the two vectors' magnitudes.
# Returns a tensor with one similarity score per chunk ID, e.g.:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": 0.98,  # cosine similarity between the query and chunk 2's embedding
#     "5": 0.75
#   }
# }
# NOTE: unlike the raw dot product, these values are scale-independent.
# The mathematical range is [-1, 1] (not 0-1): the unpacked bit vectors are
# non-negative, but the float query embedding may contain negative components.
function chunk_sim_scores() {
    expression: chunk_dot_prod() / (vector_norms(chunk_emb_vecs()) * vector_norms(query(float_embedding)))
}
2877
# Keeps only the 3 chunks with the highest BM25 lexical scores, e.g.:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "3": 3.8021805,
#     "5": 1.1021805,
#     "2": 0.5112776
#   }
# }
function top_3_chunk_text_scores() {
    expression: top(3, chunk_text_scores())
}
3290
# Keeps only the 3 chunks with the highest cosine similarity scores.
function top_3_chunk_sim_scores() {
    expression: top(3, chunk_sim_scores())
}
3695
# Mean BM25 score over the top 3 chunks.
function avg_top_3_chunk_text_scores() {
    expression: reduce(top_3_chunk_text_scores(), avg, chunk)
}
100+
# Mean cosine similarity score over the top 3 chunks.
function avg_top_3_chunk_sim_scores() {
    expression: reduce(top_3_chunk_sim_scores(), avg, chunk)
}
43105
# Highest BM25 lexical score across all chunks.
function max_chunk_text_scores() {
    expression: reduce(chunk_text_scores(), max, chunk)
}
47110
# Highest cosine similarity score across all chunks.
function max_chunk_sim_scores() {
    expression: reduce(chunk_sim_scores(), max, chunk)
}
0 commit comments