Skip to content

Commit 2723b24

Browse files
Merge pull request #1810 from vespa-engine/rag_blueprint_rank_profiles_explanation
Rag blueprint rank profiles explanation
2 parents c0015d2 + c4d4ea0 commit 2723b24

File tree

3 files changed

+76
-2
lines changed

3 files changed

+76
-2
lines changed

rag-blueprint/app/schemas/doc.sd

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@ schema doc {
7373
from-disk
7474
summary chunks_top3 {
7575
source: chunks
76-
select-elements-by: top_3_chunk_sim_scores #this needs to be added a summary-feature to the rank-profile
76+
# top_3_chunk_sim_scores needs to be added as a summary-feature to the rank-profile.
77+
# It should be a tensor with a single mapped dimension (i.e., the chunk ID),
78+
# like it is in base-features.profile
79+
select-elements-by: top_3_chunk_sim_scores
7780
}
7881
}
7982
}

rag-blueprint/app/schemas/doc/base-features.profile

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,47 +4,111 @@ rank-profile base-features {
44
query(float_embedding) tensor<float>(x[768])
55
}
66

7+
# Fixed length chunking should not cause any positional gap between elements
8+
# => lexical search (e.g., chunks contains 'hello world') works as if we had a big blob of text,
9+
# (e.g., would match ["and she said hello", "world!"])
710
rank chunks {
8-
element-gap: 0 # Fixed length chunking should not cause any positional gap between elements
11+
element-gap: 0
912
}
13+
14+
# Creates a tensor with a single mapped dimension (i.e., the chunk ID)
15+
# and the BM25 score for each chunk. Returns something like:
16+
#{
17+
# "type": "tensor(chunk{})",
18+
# "cells": {
19+
# "2": 0.5112776,
20+
# "5": 1.1021805
21+
# }
22+
#}
1023
function chunk_text_scores() {
1124
expression: elementwise(bm25(chunks),chunk,float)
1225
}
1326

27+
# Unpacks the 8 bits of each integer from every chunk into 8 floats representing their values
28+
# effectively transforming tensor<int8>(chunk{}, x[96]) into tensor<float>(chunk{}, x[768]).
29+
# This prepares the bit embeddings for dot product with the query embedding. Returns something like:
30+
#{
31+
# "type": "tensor<float>(chunk{}, x[768])",
32+
# "cells": {
33+
# "2": [0.0, 1.0, 0.0, 0.0, 1.0, ... up to 768 floats],
34+
# "5": [1.0, 0.0, 1.0, 0.0, 1.0, ... up to 768 floats]
35+
# }
36+
#}
1437
function chunk_emb_vecs() {
1538
expression: unpack_bits(attribute(chunk_embeddings))
1639
}
1740

41+
# Computes the dot product of the query embedding with each chunk embedding.
42+
# Returns a tensor with a similarity score for each chunk ID. E.g.,
43+
#{
44+
# "type": "tensor(chunk{})",
45+
# "cells": {
46+
# "2": 8.7, # e.g., this is the dot product of the query embedding with the chunk embedding for chunk ID 2
47+
# "5": 1.2
48+
# }
49+
#}
1850
function chunk_dot_prod() {
1951
expression: reduce(query(float_embedding) * chunk_emb_vecs(), sum, x)
2052
}
2153

54+
55+
# Computes the magnitude (Euclidean norm) of all vectors in a tensor.
56+
# We will use this to normalize (i.e., bring into the -1 to 1 range) the chunk dot product,
57+
# which will tend to be higher for vectors with larger magnitudes. This normalized
58+
# dot product is the cosine similarity.
2259
function vector_norms(t) {
2360
expression: sqrt(sum(pow(t, 2), x))
2461
}
62+
63+
# Computes the cosine similarity between the query embedding and each chunk embedding,
64+
# by dividing the dot product by the product of the magnitudes of the two vectors.
65+
#
66+
# Returns a tensor with a similarity score for each chunk ID. E.g.,
67+
#{
68+
# "type": "tensor(chunk{})",
69+
# "cells": {
70+
# "2": 0.98, # e.g., this is the cosine similarity between the query embedding and the chunk embedding for chunk ID 2
71+
# "5": 0.75 # notice how values are bounded to the -1 to 1 range, unlike the dot product
72+
# }
73+
#}
2574
function chunk_sim_scores() {
2675
expression: chunk_dot_prod() / (vector_norms(chunk_emb_vecs()) * vector_norms(query(float_embedding)))
2776
}
2877

78+
# Returns a tensor with the top 3 chunk IDs by their BM25 lexical scores. E.g.,
79+
#{
80+
# "type": "tensor(chunk{})",
81+
# "cells": {
82+
# "3": 3.8021805,
83+
# "5": 1.1021805,
84+
# "2": 0.5112776
85+
# }
86+
#}
2987
function top_3_chunk_text_scores() {
3088
expression: top(3, chunk_text_scores())
3189
}
3290

91+
# Returns a tensor with the top 3 chunk IDs by their cosine similarity scores.
3392
function top_3_chunk_sim_scores() {
3493
expression: top(3, chunk_sim_scores())
3594
}
3695

96+
# Returns the average of the top 3 chunks' BM25 scores.
3797
function avg_top_3_chunk_text_scores() {
3898
expression: reduce(top_3_chunk_text_scores(), avg, chunk)
3999
}
100+
101+
# Returns the average of the top 3 chunks' cosine similarity scores.
40102
function avg_top_3_chunk_sim_scores() {
41103
expression: reduce(top_3_chunk_sim_scores(), avg, chunk)
42104
}
43105

106+
# Returns the maximum of the chunk BM25 lexical scores.
44107
function max_chunk_text_scores() {
45108
expression: reduce(chunk_text_scores(), max, chunk)
46109
}
47110

111+
# Returns the maximum of the chunk cosine similarity scores.
48112
function max_chunk_sim_scores() {
49113
expression: reduce(chunk_sim_scores(), max, chunk)
50114
}

rag-blueprint/app/schemas/doc/collect-second-phase.profile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,24 @@ rank-profile collect-second-phase inherits collect-training-data {
1010
is_favorite
1111
open_count
1212
}
13+
14+
# When decaying scores by how old documents are (see below),
15+
# the freshness score reaches 0.0 after 3 years and then remains at 0.0.
1316
rank-properties {
1417
freshness(modified_timestamp).maxAge: 94672800 # 3 years in seconds
1518
}
19+
# If modified_timestamp is now, freshness is 1.0.
20+
# Otherwise, decay linearly towards 0.0 at a rate of 1/3 per year (as configured above).
1621
function modified_freshness() {
1722
expression: freshness(modified_timestamp)
1823
}
1924

25+
# Returns 1.0 if the document is a favorite, 0.0 otherwise.
2026
function is_favorite() {
2127
expression: if(attribute(favorite), 1.0, 0.0)
2228
}
2329

30+
# Returns the value of the open_count field.
2431
function open_count() {
2532
expression: attribute(open_count)
2633
}

0 commit comments

Comments
 (0)