Skip to content

Commit 82a427d

Browse files
committed
🛞 Add ResponseRater model which automatically fetches embeddings
1 parent 8e6834e commit 82a427d

File tree

11 files changed

+8952
-829
lines changed

11 files changed

+8952
-829
lines changed

.codespellignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
astroid

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,11 @@ rater = EmbeddingsRater(reference_data, embeddings_column='embedding_small')
4848

4949
# Convert LLM response embeddings to probability distributions
5050
llm_responses = np.random.rand(10, 384)
51-
pdfs = rater.get_response_pdfs('set1', llm_responses)
51+
pmfs = rater.get_response_pmfs('set1', llm_responses)
5252

5353
# Get overall survey distribution
54-
survey_pdf = rater.get_survey_response_pdf(pdfs)
55-
print(f"Survey distribution: {survey_pdf}")
54+
survey_pmf = rater.get_survey_response_pmf(pmfs)
55+
print(f"Survey distribution: {survey_pmf}")
5656
```
5757

5858
## Methodology
@@ -66,8 +66,8 @@ The ESR methodology works by:
6666
## Core Components
6767

6868
- `EmbeddingsRater`: Main class implementing the ESR methodology
69-
- `response_embeddings_to_pdf()`: Core function for similarity-to-probability conversion
70-
- `scale_pdf()`: Temperature scaling function
69+
- `response_embeddings_to_pmf()`: Core function for similarity-to-probability conversion
70+
- `scale_pmf()`: Temperature scaling function
7171

7272
## Citation
7373

embeddings_similarity_rating/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,18 @@
1010

1111
from beartype.claw import beartype_this_package
1212

13-
from .compute import response_embeddings_to_pdf, scale_pdf
13+
from .compute import response_embeddings_to_pmf, scale_pmf
1414
from .embeddings_rater import EmbeddingsRater
15+
from .response_rater import ResponseRater
1516

1617
__version__ = "1.0.0"
1718
__author__ = "Ben F. Maier, Ulf Aslak"
1819

1920
__all__ = [
2021
"EmbeddingsRater",
21-
"response_embeddings_to_pdf",
22-
"scale_pdf",
22+
"ResponseRater",
23+
"response_embeddings_to_pmf",
24+
"scale_pmf",
2325
]
2426

2527
beartype_this_package()
Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""
2-
Utility functions for computing and manipulating probability density functions (PDFs) and embeddings.
2+
Utility functions for computing and manipulating probability mass functions (PMFs) and embeddings.
33
44
This module provides functions for:
55
- Converting between different similarity metrics (cosine, KS)
6-
- Scaling PDFs using temperature parameters
7-
- Computing statistical moments of PDFs
8-
- Finding optimal temperature parameters for PDF scaling
9-
- Converting response embeddings to PDFs
6+
- Scaling PMFs using temperature parameters
7+
- Computing statistical moments of PMFs
8+
- Finding optimal temperature parameters for PMF scaling
9+
- Converting response embeddings to PMFs
1010
1111
The module is particularly useful for working with Likert scale responses and their
1212
embeddings, providing tools to analyze and transform the underlying probability
@@ -16,13 +16,13 @@
1616
import numpy as np
1717

1818

19-
def scale_pdf(pdf, temperature, max_temp=np.inf):
19+
def scale_pmf(pmf, temperature, max_temp=np.inf):
2020
"""
21-
Scale a PDF using temperature scaling.
21+
Scale a PMF using temperature scaling.
2222
2323
Parameters
2424
----------
25-
pdf : array_like
25+
pmf : array_like
2626
Input probability mass function
2727
temperature : float
2828
Temperature parameter for scaling (0 to max_temp)
@@ -32,7 +32,7 @@ def scale_pdf(pdf, temperature, max_temp=np.inf):
3232
Returns
3333
-------
3434
numpy.ndarray
35-
Scaled PDF where all values sum to 1
35+
Scaled PMF where all values sum to 1
3636
3737
Notes
3838
-----
@@ -41,38 +41,53 @@ def scale_pdf(pdf, temperature, max_temp=np.inf):
4141
- Otherwise uses the specified temperature for scaling
4242
"""
4343
if temperature == 0.0:
44-
if np.all(pdf == pdf[0]):
45-
return pdf
44+
if np.all(pmf == pmf[0]):
45+
return pmf
4646
else:
47-
new_pdf = np.zeros_like(pdf)
48-
new_pdf[np.argmax(pdf)] = 1.0
49-
return new_pdf
47+
new_pmf = np.zeros_like(pmf)
48+
new_pmf[np.argmax(pmf)] = 1.0
49+
return new_pmf
5050
elif temperature > max_temp:
51-
hist = pdf ** (1 / max_temp)
51+
hist = pmf ** (1 / max_temp)
5252
else:
53-
hist = pdf ** (1 / temperature)
53+
hist = pmf ** (1 / temperature)
5454
return hist / hist.sum()
5555

5656

57-
def response_embeddings_to_pdf(matrix_responses, matrix_likert_sentences):
57+
def response_embeddings_to_pmf(matrix_responses, matrix_likert_sentences, epsilon=0.0):
5858
"""
59-
Convert response embeddings and Likert sentence embeddings to a PDF.
59+
Convert response embeddings and Likert sentence embeddings to a PMF.
6060
6161
Parameters
6262
----------
6363
matrix_responses : array_like
6464
Matrix of response embeddings
6565
matrix_likert_sentences : array_like
6666
Matrix of Likert sentence embeddings
67+
epsilon : float, optional
68+
Small regularization parameter to prevent division by zero and add smoothing.
69+
Default is 0.0 (no regularization).
6770
6871
Returns
6972
-------
7073
numpy.ndarray
7174
Probability mass function representing the response distribution
75+
76+
Notes
77+
-----
78+
This implements the ESR equation:
79+
p_{c,i}(r) = [γ(σ_{r,i}, t_c̃) - γ(σ_{ℓ,i}, t_c̃) + ε δ_{ℓ,r}] /
80+
[Σ_r γ(σ_{r,i}, t_c̃) - n_points * γ(σ_{ℓ,i}, t_c̃) + ε]
81+
where γ is the cosine similarity function, δ_ℓ,r is the Kronecker delta,
82+
and n_points is the number of Likert scale points.
7283
"""
7384
M_left = matrix_responses
7485
M_right = matrix_likert_sentences
7586

87+
# Handle empty input case
88+
if M_left.shape[0] == 0:
89+
return np.empty((0, M_right.shape[1]))
90+
7691
# Normalize the right matrix (Likert sentences)
7792
norm_right = np.linalg.norm(M_right, axis=0)
7893
M_right = M_right / norm_right[None, :]
@@ -81,10 +96,27 @@ def response_embeddings_to_pdf(matrix_responses, matrix_likert_sentences):
8196
norm_left = np.linalg.norm(M_left, axis=1)
8297
M_left = M_left / norm_left[:, None]
8398

84-
# Calculate cosine similarities and convert to PDF
99+
# Calculate cosine similarities: γ(σ_{r,i}, t_c̃)
85100
cos = (1 + M_left.dot(M_right)) / 2
86-
cos = cos - cos.min(axis=1)[:, None]
87-
sum_per_row = cos.sum(axis=1)
88-
pdf = cos / sum_per_row[:, None]
89101

90-
return pdf
102+
# Find minimum similarity per row: γ(σ_ℓ,i, t_c̃)
103+
cos_min = cos.min(axis=1)[:, None]
104+
105+
# Numerator: γ(σ_{r,i}, t_c̃) - γ(σ_ℓ,i, t_c̃) + ε δ_ℓ,r
106+
# The ε δ_ℓ,r term adds epsilon only to exactly one minimum similarity position per row
107+
numerator = cos - cos_min
108+
if epsilon > 0:
109+
# Add epsilon to the first position that achieves minimum in each row (Kronecker delta effect)
110+
min_indices = np.argmin(cos, axis=1)
111+
for i, min_idx in enumerate(min_indices):
112+
numerator[i, min_idx] += epsilon
113+
114+
# Denominator: Σ_r γ(σ_{r,i}, t_c̃) - n_likert_points * γ(σ_ℓ,i, t_c̃) + ε
115+
# This is: sum of all similarities - n_likert_points * minimum similarity + epsilon
116+
n_likert_points = cos.shape[1]
117+
denominator = cos.sum(axis=1)[:, None] - n_likert_points * cos_min + epsilon
118+
119+
# Calculate final PMF
120+
pmf = numerator / denominator
121+
122+
return pmf

embeddings_similarity_rating/embeddings_rater.py

Lines changed: 39 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
This module provides functionality to:
55
- Validate reference sentence data structure
66
- Convert LLM response embeddings to probability distributions
7-
- Calculate survey response PDFs using different reference sets
7+
- Calculate survey response PMFs using different reference sets
88
- Compare responses against mean or specific reference sets
99
1010
The module is particularly useful for analyzing Likert scale responses from LLMs
@@ -88,10 +88,10 @@ class EmbeddingsRater:
8888
>>> # Initialize rater
8989
>>> rater = EmbeddingsRater(df, embeddings_column='embedding_small')
9090
>>>
91-
>>> # Get PDFs for some LLM responses
91+
>>> # Get PMFs for some LLM responses
9292
>>> llm_responses = np.random.rand(5, 384) # 5 responses, each with 384-dim embedding
93-
>>> pdfs = rater.get_response_pdfs('set1', llm_responses)
94-
>>> survey_pdf = rater.get_survey_response_pdf(pdfs)
93+
>>> pmfs = rater.get_response_pmfs('set1', llm_responses)
94+
>>> survey_pmf = rater.get_survey_response_pmf(pmfs)
9595
"""
9696

9797
def __init__(
@@ -124,9 +124,11 @@ def __init__(
124124
M = np.array(this_set[embeddings_column].to_list()).T
125125
self.reference_matrices[sentence_set] = M
126126

127-
def get_response_pdfs(self, reference_set_id, llm_response_matrix, temperature=1.0):
127+
def get_response_pmfs(
128+
self, reference_set_id, llm_response_matrix, temperature=1.0, epsilon=0.0
129+
):
128130
"""
129-
Convert LLM response embeddings to PDFs using specified reference set.
131+
Convert LLM response embeddings to PMFs using specified reference set.
130132
131133
Parameters
132134
----------
@@ -136,57 +138,60 @@ def get_response_pdfs(self, reference_set_id, llm_response_matrix, temperature=1
136138
Matrix of LLM response embeddings
137139
Shape: (n_responses, n_dimensions)
138140
temperature : float
139-
Get scaled pdf With temperature T:
141+
Get the scaled PMF with temperature T:
140142
``p_new[i] ~ p_old[i]^(1/T)``.
143+
epsilon : float, optional
144+
Small regularization parameter to prevent division by zero and add smoothing.
145+
Default is 0.0 (no regularization).
141146
142147
Returns
143148
-------
144149
numpy.ndarray
145150
Probability distributions for each response
146151
"""
147152
if isinstance(reference_set_id, str) and reference_set_id.lower() == "mean":
148-
# Calculate PDFs using mean over all reference sets
149-
llm_response_pdfs = np.array(
153+
# Calculate PMFs using mean over all reference sets
154+
llm_response_pmfs = np.array(
150155
[
151-
compute.response_embeddings_to_pdf(llm_response_matrix, M)
156+
compute.response_embeddings_to_pmf(llm_response_matrix, M, epsilon)
152157
for M in self.reference_matrices.values()
153158
]
154159
).mean(axis=0)
155160
else:
156-
# Calculate PDFs using specific reference set
161+
# Calculate PMFs using specific reference set
157162
M = self.reference_matrices[reference_set_id]
158-
llm_response_pdfs = compute.response_embeddings_to_pdf(
159-
llm_response_matrix, M
163+
llm_response_pmfs = compute.response_embeddings_to_pmf(
164+
llm_response_matrix, M, epsilon
160165
)
161166

162167
if temperature != 1.0:
163-
llm_response_pdfs = np.array(
164-
[compute.scale_pdf(_pdf, temperature) for _pdf in llm_response_pdfs]
168+
llm_response_pmfs = np.array(
169+
[compute.scale_pmf(_pmf, temperature) for _pmf in llm_response_pmfs]
165170
)
166171

167-
return llm_response_pdfs
172+
return llm_response_pmfs
168173

169-
def get_survey_response_pdf(self, response_pdfs):
174+
def get_survey_response_pmf(self, response_pmfs):
170175
"""
171-
Calculate the overall survey response PDF by averaging individual response PDFs.
176+
Calculate the overall survey response PMF by averaging individual response PMFs.
172177
173178
Parameters
174179
----------
175-
response_pdfs : numpy.ndarray
176-
Matrix of individual response PDFs
180+
response_pmfs : numpy.ndarray
181+
Matrix of individual response PMFs
177182
178183
Returns
179184
-------
180185
numpy.ndarray
181-
Average PDF representing the overall survey response
186+
Average PMF representing the overall survey response
182187
"""
183-
return response_pdfs.mean(axis=0)
188+
return response_pmfs.mean(axis=0)
184189

185-
def get_survey_response_pdf_by_reference_set_id(
186-
self, reference_set_id, llm_response_matrix, temperature=1.0
190+
def get_survey_response_pmf_by_reference_set_id(
191+
self, reference_set_id, llm_response_matrix, temperature=1.0, epsilon=0.0
187192
):
188193
"""
189-
Get the survey response PDF using a specific reference set.
194+
Get the survey response PMF using a specific reference set.
190195
191196
Parameters
192197
----------
@@ -196,14 +201,19 @@ def get_survey_response_pdf_by_reference_set_id(
196201
Matrix of LLM response embeddings
197202
Shape: (n_responses, n_dimensions)
198203
temperature : float, default = 1.0
199-
Get scaled pdf With temperature T:
204+
Get the scaled PMF with temperature T:
200205
``p_new[i] ~ p_old[i]^(1/T)``.
206+
epsilon : float, optional
207+
Small regularization parameter to prevent division by zero and add smoothing.
208+
Default is 0.0 (no regularization).
201209
202210
Returns
203211
-------
204212
numpy.ndarray
205-
Average PDF representing the overall survey response
213+
Average PMF representing the overall survey response
206214
"""
207-
return self.get_survey_response_pdf(
208-
self.get_response_pdfs(reference_set_id, llm_response_matrix)
215+
return self.get_survey_response_pmf(
216+
self.get_response_pmfs(
217+
reference_set_id, llm_response_matrix, temperature, epsilon
218+
)
209219
)

0 commit comments

Comments
 (0)