-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzero_shot_latent_embedding.py
More file actions
92 lines (71 loc) · 4.1 KB
/
zero_shot_latent_embedding.py
File metadata and controls
92 lines (71 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import torch
from sentence_transformers import SentenceTransformer, util
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
import os
import string
import time
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import multilabel_confusion_matrix, precision_recall_fscore_support
def run_zero_shot_latent_emb(df, candidate_labels, candidate_label_embeddings, sbert_model, transfer_matrix, experiment_name, run_name, experiment_dir):
    """Run zero-shot multi-label classification via latent-space similarity and log metrics to MLflow.

    Each row's ``description`` is encoded with ``sbert_model``, linearly mapped
    into the label-embedding space with ``transfer_matrix``, and compared by
    cosine similarity against every candidate label embedding.  A label is
    predicted when its similarity exceeds a confidence threshold; weighted
    precision/recall/F-score are computed per threshold and logged as one
    MLflow run each.

    Args:
        df: DataFrame with at least 'id', 'type', 'description', and 'labels'
            columns ('labels' being an iterable of ground-truth label names).
        candidate_labels: Sequence of candidate label names.
        candidate_label_embeddings: Label embeddings aligned 1:1 with
            ``candidate_labels`` (in the transformed latent space).
        sbert_model: SentenceTransformer used to encode descriptions.
        transfer_matrix: torch tensor mapping sentence space -> label space.
        experiment_name: MLflow experiment name.
        run_name: Name used for each logged MLflow run.
        experiment_dir: MLflow tracking URI.
    """
    mlflow.set_tracking_uri(experiment_dir)
    experiment_active = mlflow.set_experiment(experiment_name)

    # One-hot encode the closed label set so sklearn metrics can consume it.
    mlb = MultiLabelBinarizer()
    mlb.fit([candidate_labels])

    print("Running for " + run_name + "...")

    # Thresholds to include predicted labels (prediction confidence - probability)
    threshold_pred = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    start_time = time.time()

    # Collect prediction records in a plain list and build the DataFrame once:
    # appending via .loc on a growing DataFrame is quadratic in the row count.
    prediction_records = []
    for _, row in df.iterrows():
        sentence_embedding = sbert_model.encode(row['description'])
        # Apply the linear transformation to map the sentence embedding into
        # the label-embedding space (row vector @ transfer matrix).
        sentence_embedding = torch.tensor(sentence_embedding).reshape(1, -1)
        sentence_embedding_transformed = torch.mm(sentence_embedding, transfer_matrix)

        # Score every candidate label ONCE per row; the similarities do not
        # depend on the threshold, so they must not be recomputed per threshold.
        label_scores = [
            (label, util.cos_sim(sentence_embedding_transformed, label_embedding).numpy()[0][0])
            for label, label_embedding in zip(candidate_labels, candidate_label_embeddings)
        ]

        # Ground-truth encoding is also threshold-invariant.
        correct_labels = row['labels']
        encoded_correct_labels = mlb.transform([correct_labels])

        for threshold in threshold_pred:
            labels_to_include = set()
            scores_to_include = []
            for label, score in label_scores:
                if score > threshold:
                    labels_to_include.add(label)
                    scores_to_include.append(score)
            encoded_labels_to_include = mlb.transform([labels_to_include])
            prediction_records.append({
                'id': row['id'],
                'type': row['type'],
                'description': row['description'],
                'threshold': threshold,
                'labels': correct_labels,
                'predicted_labels': labels_to_include,
                'label_encoded': encoded_correct_labels,
                'predicted_label_encoded': encoded_labels_to_include,
                'prediction_prob_score': scores_to_include,
            })

    prodigy_test_case_data_predictions = pd.DataFrame(
        prediction_records,
        columns=['id', 'type', 'description', 'threshold', 'labels', 'predicted_labels',
                 'label_encoded', 'predicted_label_encoded', 'prediction_prob_score'])

    # Log one MLflow run per confidence threshold.
    for threshold in threshold_pred:
        with mlflow.start_run(experiment_id=experiment_active.experiment_id, run_name=run_name):
            threshold_df = prodigy_test_case_data_predictions[prodigy_test_case_data_predictions['threshold'] == threshold]
            # Each encoded entry is a (1, n_labels) array; take its single row.
            y_true = [encoded[0] for encoded in threshold_df['label_encoded']]
            y_pred = [encoded[0] for encoded in threshold_df['predicted_label_encoded']]
            # Weighted average handles label imbalance; zero_division=0 avoids
            # warnings when a threshold yields no predictions for some label.
            precision, recall, fscore, _ = precision_recall_fscore_support(
                y_true, y_pred, average='weighted', zero_division=0)
            mlflow.log_param("confidence_threshold", threshold)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("fscore", fscore)

    print("Execution for " + run_name + " finished!")
    end_time = time.time()
    print("Execution finished with " + str((end_time - start_time)/60) + " minutes.")