@@ -4,47 +4,111 @@ rank-profile base-features {
44 query(float_embedding) tensor<float>(x[768])
55 }
66
# Fixed-length chunking should leave no positional gap between elements,
# so lexical search across chunks (e.g. `chunks contains "hello world"`)
# behaves as if the chunks were one contiguous blob of text
# (e.g. it would match ["and she said hello", "world!"]).
rank chunks {
    element-gap: 0
}
13+
# Builds a tensor with a single mapped dimension (the chunk ID) holding the
# BM25 score of each matched chunk. Returns something like:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": 0.5112776,
#     "5": 1.1021805
#   }
# }
function chunk_text_scores() {
    expression: elementwise(bm25(chunks), chunk, float)
}
1326
# Expands every int8 of each chunk embedding into 8 floats (one per bit),
# turning tensor<int8>(chunk{}, x[96]) into tensor<float>(chunk{}, x[768]).
# This prepares the bit embeddings for a dot product with the query
# embedding. Returns something like:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": [0.0, 1.0, 0.0, 0.0, 1.0, ... up to 768 floats],
#     "5": [1.0, 0.0, 1.0, 0.0, 1.0, ... up to 768 floats]
#   }
# }
function chunk_emb_vecs() {
    expression: unpack_bits(attribute(chunk_embeddings))
}
1740
# Dot product of the query embedding with every chunk embedding, reduced
# (summed) over the dense dimension x. Yields one score per chunk ID, e.g.:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": 8.7,  # dot product of the query embedding with chunk 2's embedding
#     "5": 1.2
#   }
# }
function chunk_dot_prod() {
    expression: reduce(query(float_embedding) * chunk_emb_vecs(), sum, x)
}
2153
54+
# Euclidean norm (magnitude) of every vector in tensor v, taken along
# dimension x. Used to normalize the chunk dot products — which grow with
# the number of embedding dimensions — into cosine similarities.
function vector_norms(v) {
    expression: sqrt(sum(pow(v, 2), x))
}
62+
# Cosine similarity between the query embedding and each chunk embedding:
# the dot product divided by the product of the two vectors' magnitudes.
# Returns a tensor with one similarity score per chunk ID, e.g.:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "2": 0.98,  # cosine similarity between the query and chunk 2's embedding
#     "5": 0.75
#   }
# }
# NOTE: unlike the raw dot product, these values are scale-independent.
# The mathematical range is [-1, 1] (not 0-1): the unpacked bit vectors are
# non-negative, but the float query embedding may contain negative components.
function chunk_sim_scores() {
    expression: chunk_dot_prod() / (vector_norms(chunk_emb_vecs()) * vector_norms(query(float_embedding)))
}
2877
# Keeps only the 3 chunks with the highest BM25 lexical scores, e.g.:
# {
#   "type": "tensor(chunk{})",
#   "cells": {
#     "3": 3.8021805,
#     "5": 1.1021805,
#     "2": 0.5112776
#   }
# }
function top_3_chunk_text_scores() {
    expression: top(3, chunk_text_scores())
}
3290
# Keeps only the 3 chunks with the highest cosine similarity scores.
function top_3_chunk_sim_scores() {
    expression: top(3, chunk_sim_scores())
}
3695
# Mean BM25 score over the top 3 chunks.
function avg_top_3_chunk_text_scores() {
    expression: reduce(top_3_chunk_text_scores(), avg, chunk)
}
100+
# Mean cosine similarity score over the top 3 chunks.
function avg_top_3_chunk_sim_scores() {
    expression: reduce(top_3_chunk_sim_scores(), avg, chunk)
}
43105
# Highest BM25 lexical score across all chunks.
function max_chunk_text_scores() {
    expression: reduce(chunk_text_scores(), max, chunk)
}
47110
# Highest cosine similarity score across all chunks.
function max_chunk_sim_scores() {
    expression: reduce(chunk_sim_scores(), max, chunk)
}
0 commit comments