Description
I created a subclass of BaseRagasEmbeddings because I already have all the embeddings for the contexts, queries, and questions. I did this to avoid using the OpenAI API key, since it is costly, and I also want to be able to use other models such as Mistral. The model I used to create the embeddings is 'text-embedding-ada-002'. The problem is that I keep running into this error:
Exception has occurred: OpenAIError
The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
File "C:\Users\Amin\OneDrive - unige.it\Desktop\tirocini\code\version01rageva.py", line 165, in
evaluation_report = evaluate(ragas_data, metrics=metrics, embeddings=custom_embeddings)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
openai.OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable.
As far as I understand, I should not need an API key for this setup, so please help me fix this error. I have also included my code below for reference. Note that I am using a model from Hugging Face; the model name is:
model_name = 'distilbert-base-uncased'
import json
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from datasets import Dataset
from ragas.embeddings import BaseRagasEmbeddings
from ragas.metrics import context_utilization, ContextUtilization
from ragas import evaluate
# Load the ground truth data
file_path = r'assets\GT.json'
with open(file_path) as f:
    ground_truth_data = json.load(f)
# Load the questions, answers, and chunks
file_path = r'assets\user_llm_interaction_embeddings_c1521dd5_b819_4241_b3a4_3e5c1388037c.json'
with open(file_path) as f:
    llm = json.load(f)
# Initialize an empty list to hold the new dataset
data_set = []

# Iterate through the list and combine every two dictionaries
for i in range(0, len(llm), 2):
    combined_dict = {
        "text_vector_1": llm[i].get("text_vector", []),
        "text_vector_2": llm[i + 1].get("text_vector", []),
        "chunks": llm[i + 1].get("chunks", [])
    }
    data_set.append(combined_dict)
def map_chunks(data_set, ground_truth_data):
    for item in data_set:  # Iterate over each dictionary in data_set
        c = []  # Reset c for each item
        for chunk_id in item['chunks']:  # Loop through 'chunks' in the current dictionary
            for element in ground_truth_data:  # Loop through ground_truth_data
                if element['id'] == chunk_id:  # Match chunk_id with the element's id
                    c.append(element['text_vector'])  # Append the matching text_vector to c
        item['chunks'] = c  # Replace the chunk ids with the mapped text_vector values
    return data_set  # Return the updated data_set
data_set = map_chunks(data_set, ground_truth_data)
# Assuming data_set is a list of dictionaries
ragas_data = [
    {
        "question": entry["text_vector_1"],  # Assuming this is a list of strings
        "answer": entry["text_vector_2"],    # Assuming this is a list of strings
        "contexts": entry["chunks"]          # Assuming this is a list of lists of strings
    }
    for entry in data_set
]
# Create the required structure for Dataset
formatted_data = {
    "question": [entry["question"] for entry in ragas_data],
    "contexts": [entry["contexts"] for entry in ragas_data],
    "answer": [entry["answer"] for entry in ragas_data]
}
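# Note (added for clarity, not in my original script): Dataset is imported above but
# never actually used. I believe the intended step here is roughly the following,
# since ragas.evaluate normally expects a datasets.Dataset rather than a plain dict/list:
# eval_dataset = Dataset.from_dict(formatted_data)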
model_name = 'distilbert-base-uncased'
class CustomHuggingFaceRagasEmbeddings(BaseRagasEmbeddings):

    def __init__(self, model_name: str, custom_embeddings: list = None):
        """
        Initialize the custom Hugging Face Ragas embeddings with the specified
        model and optional pre-computed embeddings.

        Parameters:
            model_name (str): The name of the Hugging Face model to use (e.g., 'distilbert-base-uncased').
            custom_embeddings (list): A list of pre-computed custom embeddings (optional).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.custom_embeddings = custom_embeddings  # Store the pre-computed embeddings
    def embed_documents(self, texts: list) -> np.ndarray:
        """
        Generate embeddings for a list of documents.

        Parameters:
            texts (list): A list of documents to embed.

        Returns:
            np.ndarray: An array of embeddings for the documents.
        """
        if self.custom_embeddings is not None:
            # If pre-computed embeddings are provided, return those instead
            return np.array(self.custom_embeddings)

        # Otherwise generate new embeddings with the Hugging Face model
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the CLS token as the sentence embedding
        embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings.numpy()  # Convert to a NumPy array
    def embed_query(self, query: str) -> np.ndarray:
        """
        Generate an embedding for a single query.

        Parameters:
            query (str): The query to embed.

        Returns:
            np.ndarray: The embedding for the query.
        """
        if self.custom_embeddings is not None:
            # Relating a single query to the pre-computed embeddings is not handled yet
            raise NotImplementedError("Custom query embeddings are not supported with provided custom embeddings.")

        # Generate a new embedding using the model
        inputs = self.tokenizer(query, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the CLS token as the query embedding
        embedding = outputs.last_hidden_state[:, 0, :]
        return embedding.numpy()  # Convert to a NumPy array
# Initialize the custom embeddings class with the pre-computed embeddings
custom_embeddings = CustomHuggingFaceRagasEmbeddings(model_name=model_name, custom_embeddings=ragas_data)
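# Aside (untested assumption on my part): instead of subclassing BaseRagasEmbeddings,
# ragas also seems to accept LangChain embeddings via a wrapper, e.g.:
# from ragas.embeddings import LangchainEmbeddingsWrapper
# from langchain_community.embeddings import HuggingFaceEmbeddings
# ragas_embeddings = LangchainEmbeddingsWrapper(HuggingFaceEmbeddings(model_name=model_name))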
# Define the evaluation metrics
metrics = [context_utilization]
# Let's define a custom version of the evaluate function (draft, currently commented out)
# def custom_evaluate(ragas_data, metrics, embeddings: BaseRagasEmbeddings):
#     """
#     Custom evaluation function that avoids using the OpenAI API.
#
#     Parameters:
#         dataset: The dataset to evaluate.
#         metrics: A list of metrics to evaluate.
#         embeddings: A custom embedding model (subclass of BaseRagasEmbeddings).
#
#     Returns:
#         A dictionary of evaluation results.
#     """
#     results = {}
#     # Iterate over the metrics and evaluate
#     for metric in metrics:
#         try:
#             # Make sure the metric is compatible with the custom embeddings
#             if isinstance(metric, ContextUtilization):  # Check for the specific metric type
#                 result = evaluate(ragas_data, metric, embeddings=embeddings)
#                 results[metric.name] = result
#         except Exception as e:
#             print(f"Error while evaluating metric {metric.name}: {e}")
#     return results
# Run the evaluation
evaluation_report = evaluate(ragas_data, metrics=metrics, embeddings=custom_embeddings)

# Print the evaluation results
print("RAGAS Evaluation Report:")
print(evaluation_report)
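My current suspicion is that the error does not come from my embeddings at all: when no llm argument is passed, evaluate() appears to fall back to a default OpenAI-based LLM for the metric, and that default client is what asks for OPENAI_API_KEY. If that is right, I would need to pass an explicit non-OpenAI LLM as well. Below is an untested sketch of what I am planning to try; LangchainLLMWrapper, the HuggingFacePipeline import path, and the Mistral model id are my assumptions and may differ depending on the installed ragas/langchain versions.

# Untested sketch: pass an explicit local LLM so the metric does not fall back
# to the default OpenAI client. Import paths and model id are assumptions.
from langchain_community.llms import HuggingFacePipeline
from ragas.llms import LangchainLLMWrapper

local_llm = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-Instruct-v0.2",  # any locally available instruct model
    task="text-generation",
)
ragas_llm = LangchainLLMWrapper(local_llm)

evaluation_report = evaluate(
    ragas_data,
    metrics=metrics,
    llm=ragas_llm,                 # explicit LLM instead of the OpenAI default
    embeddings=custom_embeddings,  # the custom embeddings defined above
)

Does this look like the right direction, or is there a supported way to run context_utilization entirely without an OpenAI key?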