Description
I created a subclass of BaseRagasEmbeddings because I already have all the embeddings for the contexts, queries, and questions. I did this to avoid using the OpenAI API key, since it is costly, and I also want to be able to use other models such as Mistral. The model I used to create the embeddings is 'text-embedding-ada-002'. The problem is that I keep running into this error:
Exception has occurred: OpenAIError
The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
File "C:\Users\Amin\OneDrive - unige.it\Desktop\tirocini\code\version01rageva.py", line 165, in
evaluation_report = evaluate(ragas_data, metrics=metrics, embeddings=custom_embeddings)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
openai.OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable.
As far as I understand, I should not need an API key for this setup, so please help me fix this error. I have also included my code below for reference. Note that I am using a model from Hugging Face; the model name is:
model_name = 'distilbert-base-uncased'
import json
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from datasets import Dataset
from ragas.embeddings import BaseRagasEmbeddings
from ragas.metrics import context_utilization, ContextUtilization
from ragas import evaluate
# Load the ground truth data
file_path = r'assets\GT.json'
with open(file_path) as f:
    ground_truth_data = json.load(f)
# Load the questions, answers, and chunks
file_path = r'assets\user_llm_interaction_embeddings_c1521dd5_b819_4241_b3a4_3e5c1388037c.json'
with open(file_path) as f:
    llm = json.load(f)
# Initialize an empty list to hold the new dataset
data_set = []

# Iterate through the list and combine every two dictionaries
for i in range(0, len(llm), 2):
    combined_dict = {
        "text_vector_1": llm[i].get("text_vector", []),
        "text_vector_2": llm[i + 1].get("text_vector", []),
        "chunks": llm[i + 1].get("chunks", [])
    }
    data_set.append(combined_dict)
def map_chunks(data_set, ground_truth_data):
    for item in data_set:  # Iterate over each dictionary in data_set
        c = []  # Reset c for each item
        for chunk_id in item['chunks']:  # Loop through 'chunks' in the current dictionary
            for element in ground_truth_data:  # Loop through ground_truth_data
                if element['id'] == chunk_id:  # Match chunk_id with the element's id
                    c.append(element['text_vector'])  # Append the matching text_vector to c
        item['chunks'] = c  # Replace the chunk ids with the mapped text_vector values
    return data_set  # Return the updated data_set
data_set = map_chunks(data_set, ground_truth_data)
# Assuming data_set is a list of dictionaries
ragas_data = [
    {
        "question": entry["text_vector_1"],  # Assuming this is a list of strings
        "answer": entry["text_vector_2"],    # Assuming this is a list of strings
        "contexts": entry["chunks"]          # Assuming this is a list of lists of strings
    }
    for entry in data_set
]
# Create the required structure for Dataset
formatted_data = {
    "question": [entry["question"] for entry in ragas_data],
    "contexts": [entry["contexts"] for entry in ragas_data],
    "answer": [entry["answer"] for entry in ragas_data]
}
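# Note (added for clarity, not in my original script): Dataset is imported above but
# never actually used. I believe the intended step here is roughly the following,
# since ragas.evaluate normally expects a datasets.Dataset rather than a plain dict/list:
# eval_dataset = Dataset.from_dict(formatted_data)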
model_name = 'distilbert-base-uncased'
class CustomHuggingFaceRagasEmbeddings(BaseRagasEmbeddings):

    def __init__(self, model_name: str, custom_embeddings: list = None):
        """
        Initialize the custom Hugging Face Ragas embeddings with the specified
        model and optional pre-computed embeddings.

        Parameters:
            model_name (str): The name of the Hugging Face model to use (e.g., 'distilbert-base-uncased').
            custom_embeddings (list): A list of pre-computed custom embeddings (optional).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.custom_embeddings = custom_embeddings  # Store the pre-computed embeddings
    def embed_documents(self, texts: list) -> np.ndarray:
        """
        Generate embeddings for a list of documents.

        Parameters:
            texts (list): A list of documents to embed.

        Returns:
            np.ndarray: An array of embeddings for the documents.
        """
        if self.custom_embeddings is not None:
            # If pre-computed embeddings are provided, return those instead
            return np.array(self.custom_embeddings)

        # Otherwise generate new embeddings with the Hugging Face model
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the CLS token as the sentence embedding
        embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings.numpy()  # Convert to a NumPy array
    def embed_query(self, query: str) -> np.ndarray:
        """
        Generate an embedding for a single query.

        Parameters:
            query (str): The query to embed.

        Returns:
            np.ndarray: The embedding for the query.
        """
        if self.custom_embeddings is not None:
            # Relating a single query to the pre-computed embeddings is not handled yet
            raise NotImplementedError("Custom query embeddings are not supported with provided custom embeddings.")

        # Generate a new embedding using the model
        inputs = self.tokenizer(query, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the CLS token as the query embedding
        embedding = outputs.last_hidden_state[:, 0, :]
        return embedding.numpy()  # Convert to a NumPy array
# Initialize the custom embeddings class with the pre-computed embeddings
custom_embeddings = CustomHuggingFaceRagasEmbeddings(model_name=model_name, custom_embeddings=ragas_data)
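# Aside (untested assumption on my part): instead of subclassing BaseRagasEmbeddings,
# ragas also seems to accept LangChain embeddings via a wrapper, e.g.:
# from ragas.embeddings import LangchainEmbeddingsWrapper
# from langchain_community.embeddings import HuggingFaceEmbeddings
# ragas_embeddings = LangchainEmbeddingsWrapper(HuggingFaceEmbeddings(model_name=model_name))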
# Define the evaluation metrics
metrics = [context_utilization]
# Let's define a custom version of the evaluate function (draft, currently commented out)
# def custom_evaluate(ragas_data, metrics, embeddings: BaseRagasEmbeddings):
#     """
#     Custom evaluation function that avoids using the OpenAI API.
#
#     Parameters:
#         dataset: The dataset to evaluate.
#         metrics: A list of metrics to evaluate.
#         embeddings: A custom embedding model (subclass of BaseRagasEmbeddings).
#
#     Returns:
#         A dictionary of evaluation results.
#     """
#     results = {}
#     # Iterate over the metrics and evaluate
#     for metric in metrics:
#         try:
#             # Make sure the metric is compatible with the custom embeddings
#             if isinstance(metric, ContextUtilization):  # Check for the specific metric type
#                 result = evaluate(ragas_data, metric, embeddings=embeddings)
#                 results[metric.name] = result
#         except Exception as e:
#             print(f"Error while evaluating metric {metric.name}: {e}")
#     return results
# Run the evaluation
evaluation_report = evaluate(ragas_data, metrics=metrics, embeddings=custom_embeddings)

# Print the evaluation results
print("RAGAS Evaluation Report:")
print(evaluation_report)
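My current suspicion is that the error does not come from my embeddings at all: when no llm argument is passed, evaluate() appears to fall back to a default OpenAI-based LLM for the metric, and that default client is what asks for OPENAI_API_KEY. If that is right, I would need to pass an explicit non-OpenAI LLM as well. Below is an untested sketch of what I am planning to try; LangchainLLMWrapper, the HuggingFacePipeline import path, and the Mistral model id are my assumptions and may differ depending on the installed ragas/langchain versions.

# Untested sketch: pass an explicit local LLM so the metric does not fall back
# to the default OpenAI client. Import paths and model id are assumptions.
from langchain_community.llms import HuggingFacePipeline
from ragas.llms import LangchainLLMWrapper

local_llm = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-Instruct-v0.2",  # any locally available instruct model
    task="text-generation",
)
ragas_llm = LangchainLLMWrapper(local_llm)

evaluation_report = evaluate(
    ragas_data,
    metrics=metrics,
    llm=ragas_llm,                 # explicit LLM instead of the OpenAI default
    embeddings=custom_embeddings,  # the custom embeddings defined above
)

Does this look like the right direction, or is there a supported way to run context_utilization entirely without an OpenAI key?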