5 changes: 4 additions & 1 deletion rag-blueprint/app/schemas/doc.sd
@@ -73,7 +73,10 @@ schema doc {
from-disk
summary chunks_top3 {
source: chunks
select-elements-by: top_3_chunk_sim_scores #this needs to be added a summary-feature to the rank-profile
# top_3_chunk_sim_scores needs to be added as a summary-feature to the rank-profile.
# It should be a tensor with a single mapped dimension (i.e., the chunk ID),
# like it is in base-features.profile
select-elements-by: top_3_chunk_sim_scores
}
}
}
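As a rough illustration (plain Python, not Vespa code and not part of this diff): select-elements-by keeps only the chunk elements whose indexes appear as keys in the named summary feature. A minimal sketch of that selection, with made-up chunk contents and scores:

    # Chunks stored in the document, indexed by position.
    chunks = ["intro", "background", "the relevant part", "more detail", "appendix", "closing"]

    # Summary feature computed by the rank-profile: a mapped tensor of the
    # top-3 chunk similarity scores, keyed by chunk index (hypothetical values).
    top_3_chunk_sim_scores = {"2": 0.98, "3": 0.91, "5": 0.75}

    # The summary keeps only the elements whose index appears in the tensor.
    selected = {int(i): chunks[int(i)] for i in top_3_chunk_sim_scores}
    print(selected)  # {2: 'the relevant part', 3: 'more detail', 5: 'closing'}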
66 changes: 65 additions & 1 deletion rag-blueprint/app/schemas/doc/base-features.profile
@@ -4,47 +4,111 @@ rank-profile base-features {
query(float_embedding) tensor<float>(x[768])
}

# Fixed-length chunking should not cause any positional gap between elements,
# so lexical search (e.g., chunks contains 'hello world') works as if we had one big blob of text
# (e.g., the phrase would match across ["and she said hello", "world!"]).
rank chunks {
element-gap: 0 # Fixed length chunking should not cause any positional gap between elements
element-gap: 0
}
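To see why a zero element gap matters for phrase matching, here is a rough Python sketch of the positional bookkeeping (not how Vespa matches internally): with element-gap: 0 the last token of one chunk and the first token of the next sit at adjacent positions, so a phrase can match across the boundary.

    import re

    def token_positions(chunks, element_gap):
        # Assign token positions as if the chunks were one continuous text,
        # leaving `element_gap` empty positions between consecutive chunks.
        positions, pos = {}, 0
        for chunk in chunks:
            for token in re.findall(r"\w+", chunk.lower()):
                positions.setdefault(token, []).append(pos)
                pos += 1
            pos += element_gap
        return positions

    def phrase_matches(chunks, phrase, element_gap):
        # A two-word phrase matches if its words occur at adjacent positions.
        positions = token_positions(chunks, element_gap)
        first, second = phrase.lower().split()
        return any(p + 1 in positions.get(second, []) for p in positions.get(first, []))

    chunks = ["and she said hello", "world!"]
    print(phrase_matches(chunks, "hello world", element_gap=0))  # True: matches across the boundary
    print(phrase_matches(chunks, "hello world", element_gap=1))  # False: the gap breaks adjacency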

# Creates a tensor with a single mapped dimension (i.e., the chunk ID)
# and the BM25 score for each chunk. Returns something like:
#{
# "type": "tensor(chunk{})",
# "cells": {
# "2": 0.5112776,
# "5": 1.1021805
# }
#}
function chunk_text_scores() {
expression: elementwise(bm25(chunks),chunk,float)
}
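Conceptually (a Python sketch, not part of the profile), the result is a mapping from chunk index to a per-chunk lexical score; the scoring function below is a simplified stand-in for BM25, and only chunks that matched get a cell, as in the example above.

    def lexical_score(query_terms, chunk):
        # Simplified stand-in for BM25: count query-term occurrences in the chunk.
        tokens = chunk.lower().split()
        return float(sum(tokens.count(t) for t in query_terms))

    chunks = ["nothing relevant here", "still nothing", "hello world", "filler", "filler", "hello again"]
    query_terms = ["hello", "world"]

    # Mapped tensor over the chunk dimension, analogous to tensor<float>(chunk{}).
    chunk_text_scores = {}
    for i, chunk in enumerate(chunks):
        score = lexical_score(query_terms, chunk)
        if score > 0:  # only chunks that matched get a cell
            chunk_text_scores[str(i)] = score
    print(chunk_text_scores)  # {'2': 2.0, '5': 1.0}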

# Unpacks the 8 bits of each int8 value from every chunk into 8 floats representing their values,
# effectively transforming tensor<int8>(chunk{}, x[96]) into tensor<float>(chunk{}, x[768]).
# This prepares the bit embeddings for dot product with the query embedding. Returns something like:
#{
# "type": "tensor(chunk{})",
# "cells": {
# "2": [0.0, 1.0, 0.0, 0.0, 1.0, ... up to 768 floats],
# "5": [1.0, 0.0, 1.0, 0.0, 1.0, ... up to 768 floats]
# }
#}
function chunk_emb_vecs() {
expression: unpack_bits(attribute(chunk_embeddings))
}
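A numpy sketch of the same unpacking, with made-up packed embeddings of 96 int8 values per chunk (assuming bits are unpacked most-significant-bit first, which is numpy's default bit order):

    import numpy as np

    rng = np.random.default_rng(0)

    # Packed binary embeddings, analogous to tensor<int8>(chunk{}, x[96]).
    packed = {"2": rng.integers(-128, 128, size=96, dtype=np.int8),
              "5": rng.integers(-128, 128, size=96, dtype=np.int8)}

    # Unpack each byte into 8 bits and cast to float, giving 768 values per
    # chunk, analogous to tensor<float>(chunk{}, x[768]).
    chunk_emb_vecs = {cid: np.unpackbits(vec.view(np.uint8)).astype(np.float32)
                      for cid, vec in packed.items()}
    print(chunk_emb_vecs["2"].shape)  # (768,)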

# Computes the dot product of the query embedding with each chunk embedding.
# Returns a tensor with a similarity score for each chunk ID. E.g.,
#{
# "type": "tensor(chunk{})",
# "cells": {
# "2": 8.7, # e.g., this is the dot product of the query embedding with the chunk embedding for chunk ID 2
# "5": 1.2
# }
#}
function chunk_dot_prod() {
expression: reduce(query(float_embedding) * chunk_emb_vecs(), sum, x)
}
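In numpy terms this is one dot product per chunk ID (toy 4-dimensional vectors below, standing in for x[768]):

    import numpy as np

    query = np.array([0.5, -0.25, 1.0, 0.0], dtype=np.float32)
    chunk_emb_vecs = {"2": np.array([1.0, 0.0, 1.0, 1.0], dtype=np.float32),
                      "5": np.array([0.0, 1.0, 0.0, 1.0], dtype=np.float32)}

    # reduce(query(float_embedding) * chunk_emb_vecs(), sum, x):
    # summing over x leaves one scalar per chunk ID.
    chunk_dot_prod = {cid: float(query @ vec) for cid, vec in chunk_emb_vecs.items()}
    print(chunk_dot_prod)  # {'2': 1.5, '5': -0.25}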


# Computes the magnitude (Euclidean norm) of all vectors in a tensor.
# We will use this to normalize (i.e., bring back to 0-1 range) the chunk dot product,
# which will tend to be higher for embeddings with more dimensions. This normalized
# dot product is the cosine similarity.
function vector_norms(t) {
expression: sqrt(sum(pow(t, 2), x))
}
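In numpy terms this helper is just the L2 norm over the x dimension, e.g.:

    import numpy as np

    v = np.array([3.0, 4.0])
    print(np.sqrt(np.sum(v ** 2)))  # 5.0, same as np.linalg.norm(v)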

# Computes the cosine similarity between the query embedding and each chunk embedding,
# by dividing the dot product by the product of the magnitudes of the two vectors.
#
# Returns a tensor with a similarity score for each chunk ID. E.g.,
#{
# "type": "tensor(chunk{})",
# "cells": {
# "2": 0.98, # e.g., this is the cosine similarity between the query embedding and the chunk embedding for chunk ID 2
# "5": 0.75 # notice how values are normalized to 0-1 range, unlike the dot product
# }
#}
function chunk_sim_scores() {
expression: chunk_dot_prod() / (vector_norms(chunk_emb_vecs()) * vector_norms(query(float_embedding)))
}
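Putting the previous pieces together in numpy (again with toy 4-dimensional vectors): cosine similarity is the dot product divided by the product of the two vector norms.

    import numpy as np

    query = np.array([0.5, 0.5, 0.0, 0.0], dtype=np.float32)
    chunk_emb_vecs = {"2": np.array([1.0, 1.0, 0.0, 0.0], dtype=np.float32),
                      "5": np.array([0.0, 1.0, 1.0, 1.0], dtype=np.float32)}

    def cosine(q, v):
        return float(q @ v / (np.linalg.norm(q) * np.linalg.norm(v)))

    chunk_sim_scores = {cid: cosine(query, vec) for cid, vec in chunk_emb_vecs.items()}
    print(chunk_sim_scores)  # {'2': ~1.0, '5': ~0.41}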

# Returns a tensor with the top 3 chunk IDs by their BM25 lexical scores. E.g.,
#{
# "type": "tensor(chunk{})",
# "cells": {
# "3": 3.8021805,
# "5": 1.1021805,
# "2": 0.5112776
# }
#}
function top_3_chunk_text_scores() {
expression: top(3, chunk_text_scores())
}
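A Python equivalent of the top(3, ...) reduction: keep only the three highest-valued cells of the mapped tensor.

    import heapq

    chunk_text_scores = {"0": 0.2, "2": 0.5112776, "3": 3.8021805, "5": 1.1021805}

    top_3 = dict(heapq.nlargest(3, chunk_text_scores.items(), key=lambda kv: kv[1]))
    print(top_3)  # {'3': 3.8021805, '5': 1.1021805, '2': 0.5112776}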

# Returns a tensor with the top 3 chunk IDs by their cosine similarity scores.
function top_3_chunk_sim_scores() {
expression: top(3, chunk_sim_scores())
}

# Returns the average of the top 3 chunks' BM25 scores.
function avg_top_3_chunk_text_scores() {
expression: reduce(top_3_chunk_text_scores(), avg, chunk)
}

# Returns the average of the top 3 chunks' cosine similarity scores.
function avg_top_3_chunk_sim_scores() {
expression: reduce(top_3_chunk_sim_scores(), avg, chunk)
}

# Returns the maximum of the chunk BM25 lexical scores.
function max_chunk_text_scores() {
expression: reduce(chunk_text_scores(), max, chunk)
}

# Returns the maximum of the chunk cosine similarity scores.
function max_chunk_sim_scores() {
expression: reduce(chunk_sim_scores(), max, chunk)
}
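And the remaining reductions in plain Python terms: an average over the kept top-3 cells, and a max over all cells of the chunk dimension.

    top_3_scores = {"3": 3.8021805, "5": 1.1021805, "2": 0.5112776}
    all_scores = {"0": 0.2, **top_3_scores}

    avg_top_3 = sum(top_3_scores.values()) / len(top_3_scores)  # reduce(..., avg, chunk)
    max_score = max(all_scores.values())                        # reduce(..., max, chunk)
    print(avg_top_3, max_score)  # 1.805..., 3.8021805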
7 changes: 7 additions & 0 deletions rag-blueprint/app/schemas/doc/collect-second-phase.profile
@@ -10,17 +10,24 @@ rank-profile collect-second-phase inherits collect-training-data {
is_favorite
open_count
}

# When decaying scores by document age (see modified_freshness below),
# reach 0.0 after 3 years and then stay at 0.0.
rank-properties {
freshness(modified_timestamp).maxAge: 94672800 # 3 years in seconds
}
# If modified_timestamp is now, freshness is 1.0.
# Otherwise, it decays linearly towards 0.0 at a rate of 1/3 per year (as configured above).
function modified_freshness() {
expression: freshness(modified_timestamp)
}
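A Python sketch (with made-up timestamps) of the linear decay this configures: 1.0 for a document modified right now, reaching 0.0 at maxAge and staying there.

    MAX_AGE = 94_672_800  # 3 years in seconds, matching the rank-property above

    def modified_freshness(modified_timestamp, now):
        age = max(0.0, now - modified_timestamp)
        return max(0.0, 1.0 - age / MAX_AGE)

    now = 1_700_000_000
    print(modified_freshness(now, now))                 # 1.0  (just modified)
    print(modified_freshness(now - MAX_AGE // 2, now))  # 0.5  (1.5 years old)
    print(modified_freshness(now - 4 * MAX_AGE, now))   # 0.0  (older than 3 years)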

# Returns 1.0 if the document is a favorite, 0.0 otherwise.
function is_favorite() {
expression: if(attribute(favorite), 1.0, 0.0)
}

# Returns the value of the open_count field.
function open_count() {
expression: attribute(open_count)
}