Skip to content

Metadata column is missing in generated test set [ragas==0.3.0] #2125

@zahidiqbalnbs

Description

@zahidiqbalnbs

Describe the bug
The generated test set has no 'metadata' column, so I cannot tell which source document each question is related to.

Ragas version: 0.3.0
Python version:
3.12

Code to Reproduce
Share code to reproduce the issue

import os
import logging
from pathlib import Path
from typing import Optional, List
import sys

from llama_index.core import SimpleDirectoryReader, Document
from ragas.testset import TestsetGenerator
from llama_index.llms.litellm import LiteLLM
from llama_index.embeddings.litellm import LiteLLMEmbedding
from dotenv import load_dotenv
import pandas as pd


# Configure logging: every record goes to both a log file and standard output
logging.basicConfig(
    handlers=[
        logging.FileHandler("testset_generator.log"),
        logging.StreamHandler(sys.stdout),
    ],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger shared by the classes and functions below
logger = logging.getLogger(__name__)


class TestsetGeneratorConfig:
    """Configuration for testset generation, read from environment variables.

    ``load_dotenv()`` is called first, then every setting is read with
    ``os.getenv``. Call :meth:`validate` before using the configuration.
    """

    def __init__(self):
        load_dotenv()
        self.api_key = os.getenv("LITELLM_API_KEY", "")
        self.api_base = os.getenv("LITELLM_API", "")
        self.model_name = os.getenv("MODEL_NAME", "")
        self.embedding_model = os.getenv("EMBEDDING_MODEL", "")
        self.documents_dir = os.getenv("DOCUMENTS_DIR", "./abc")
        self.output_dir = os.getenv("OUTPUT_DIR", "./output")
        # A malformed TESTSET_SIZE is stored as None and rejected by
        # validate(), instead of raising ValueError during construction.
        self.testset_size = self._parse_testset_size(
            os.getenv("TESTSET_SIZE", "10")
        )

    @staticmethod
    def _parse_testset_size(raw_value: Optional[str]) -> Optional[int]:
        """Return ``raw_value`` converted to int, or None if it is not an integer."""
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            return None

    def validate(self) -> bool:
        """Validate that all required configuration is present and usable.

        Returns:
            True when the four required environment variables are non-empty
            and ``testset_size`` is a positive integer; False otherwise (the
            reason is logged as an error).
        """
        required_fields = [
            ("LITELLM_API_KEY", self.api_key),
            ("LITELLM_API", self.api_base),
            ("MODEL_NAME", self.model_name),
            ("EMBEDDING_MODEL", self.embedding_model),
        ]

        missing_fields = [field for field, value in required_fields if not value]

        if missing_fields:
            logger.error(f"Missing required environment variables: {missing_fields}")
            return False

        # None means TESTSET_SIZE could not be parsed as an integer.
        if self.testset_size is None or self.testset_size < 1:
            logger.error("TESTSET_SIZE must be a positive integer")
            return False

        return True


class DocumentLoader:
    """Loads source documents from a directory on disk."""

    @staticmethod
    def load_documents(directory_path: str) -> Optional[List[Document]]:
        """
        Read the documents found under ``directory_path``.

        Args:
            directory_path: Path to the directory containing documents

        Returns:
            The loaded documents, or None when the directory does not exist,
            is empty, yields no documents, or an exception is raised
        """
        try:
            directory_missing = not os.path.exists(directory_path)
            if directory_missing:
                logger.error(f"Directory does not exist: {directory_path}")
                return None

            entries = os.listdir(directory_path)
            if len(entries) == 0:
                logger.error(f"Directory is empty: {directory_path}")
                return None

            loaded = SimpleDirectoryReader(directory_path).load_data()
            if not loaded:
                logger.error(f"No valid documents found in directory: {directory_path}")
                return None

            doc_count = len(loaded)
            logger.info(f"Successfully loaded {doc_count} documents from {directory_path}")
            return loaded

        except Exception as exc:
            # Any failure is reported through the log and a None result
            logger.error(f"Error loading documents from {directory_path}: {exc!s}")
            return None


class TestsetGeneratorService:
    """Service class for generating testsets.

    Wraps model initialisation, testset generation and CSV export. Each
    public method catches exceptions, logs them with a traceback, and signals
    failure through its return value (``False`` / ``None``) rather than
    raising.
    """

    # Maps column names of ``testset.to_pandas()`` to the exported names.
    COLUMN_MAPPING = {
        'user_input': 'input',
        'reference_contexts': 'context',
        'reference': 'reference_output',
    }

    # Columns preserved in the output when the generated testset has them.
    # NOTE(review): whether ragas emits a 'metadata' column depends on the
    # ragas version -- confirm against the installed release. Without this
    # pass-through the column filter would drop it even when present.
    OPTIONAL_COLUMNS = ('metadata',)

    def __init__(self, config: TestsetGeneratorConfig):
        self.config = config
        # Populated by initialize_models(); None until then.
        self.llm = None
        self.embeddings = None
        self.generator = None

    def initialize_models(self) -> bool:
        """Initialize the LLM, the embedding model and the testset generator.

        Returns:
            True if all three objects were created, False if any step raised
        """
        try:
            self.llm = LiteLLM(
                model=self.config.model_name,
                api_key=self.config.api_key,
                api_base=self.config.api_base
            )

            self.embeddings = LiteLLMEmbedding(
                api_key=self.config.api_key,
                api_base=self.config.api_base,
                model_name=self.config.embedding_model
            )

            self.generator = TestsetGenerator.from_llama_index(
                llm=self.llm,
                embedding_model=self.embeddings
            )

            logger.info("Successfully initialized models")
            return True

        except Exception as e:
            # logger.exception keeps the traceback alongside the message
            logger.exception(f"Error initializing models: {str(e)}")
            return False

    def generate_testset(self, documents: List[Document]) -> Optional[pd.DataFrame]:
        """
        Generate testset from documents.

        Args:
            documents: List of documents to generate testset from

        Returns:
            DataFrame with the renamed target columns (plus any optional
            columns that are present), or None if the input is empty, the
            generator is not initialized, no target column is found, or an
            error occurs
        """
        try:
            if not documents:
                logger.error("No documents provided for testset generation")
                return None

            # Fail with a clear message instead of an AttributeError on None
            if self.generator is None:
                logger.error(
                    "Testset generator is not initialized; "
                    "call initialize_models() first"
                )
                return None

            logger.info(f"Generating testset with size {self.config.testset_size}")
            testset = self.generator.generate_with_llamaindex_docs(
                documents,
                testset_size=self.config.testset_size
            )

            df = testset.to_pandas()

            # Warn about expected source columns that the testset lacks
            missing_columns = [
                col for col in self.COLUMN_MAPPING if col not in df.columns
            ]
            if missing_columns:
                logger.warning(f"Missing columns in testset: {missing_columns}")

            # Rename columns that exist
            df = df.rename(columns=self.COLUMN_MAPPING)

            # Select only the target columns that exist
            target_columns = list(self.COLUMN_MAPPING.values())
            available_columns = [col for col in target_columns if col in df.columns]

            if not available_columns:
                logger.error("No target columns found in the testset")
                return None

            # Keep optional columns (e.g. metadata) rather than dropping them
            extra_columns = [
                col for col in self.OPTIONAL_COLUMNS
                if col in df.columns and col not in available_columns
            ]

            df = df[available_columns + extra_columns]

            logger.info(f"Successfully generated testset with {len(df)} entries")
            return df

        except Exception as e:
            logger.exception(f"Error generating testset: {str(e)}")
            return None

    def save_testset(self, df: pd.DataFrame, filename: str = "testset_output.csv") -> bool:
        """
        Save testset to CSV file inside the configured output directory.

        Args:
            df: DataFrame to save
            filename: Output filename

        Returns:
            True if successful, False otherwise
        """
        try:
            # Create output directory if it doesn't exist
            output_path = Path(self.config.output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

            file_path = output_path / filename
            df.to_csv(file_path, index=False)

            logger.info(f"Successfully saved testset to {file_path}")
            return True

        except Exception as e:
            logger.exception(f"Error saving testset: {str(e)}")
            return False


def main():
    """Run the whole pipeline: configure, load, initialize, generate, save.

    Any stage that fails logs an error and ends the process with exit
    status 1.
    """

    def abort(reason):
        # Log the failure and stop the process with a non-zero status
        logger.error(reason)
        sys.exit(1)

    logger.info("Starting testset generation process")

    # Stage 1: configuration
    config = TestsetGeneratorConfig()
    if not config.validate():
        abort("Configuration validation failed")

    # Stage 2: source documents
    documents = DocumentLoader.load_documents(config.documents_dir)
    if documents is None:
        abort("Failed to load documents")

    # Stage 3: models and generator
    service = TestsetGeneratorService(config)
    if not service.initialize_models():
        abort("Failed to initialize models")

    # Stage 4: testset generation
    testset_df = service.generate_testset(documents)
    if testset_df is None:
        abort("Failed to generate testset")

    # Stage 5: persist the result
    if not service.save_testset(testset_df):
        abort("Failed to save testset")

    logger.info("Testset generation completed successfully")


if __name__ == "__main__":
    main()

Error trace
No Error

Expected behavior
The output should contain a metadata column that includes the source file name.

Additional context
Add any other context about the problem here.

Metadata

Metadata

Assignees

No one assigned

    Labels

    answered 🤖 (The question has been answered. Will be closed automatically if no new comments) · bug (Something isn't working) · module-testsetgen (Module testset generation)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions