Skip to content

Only getting one entry in "contexts" list while generating synthetic data using TestsetGenerator #2145

@zahidiqbalnbs

Description

@zahidiqbalnbs

Describe the bug
A clear and concise description of what the bug is.

Ragas version:
ragas==0.1.22

Code to Reproduce

import logging
import os
import sys
from pathlib import Path
from typing import Optional, List

import pandas as pd
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader, Document
from llama_index.embeddings.litellm import LiteLLMEmbedding
from llama_index.llms.litellm import LiteLLM
from ragas.testset import TestsetGenerator


# Configure logging: INFO level, written to testset_generator.log and echoed
# to stdout. The handlers are built first so the basicConfig call stays short.
_LOG_HANDLERS = [
    logging.FileHandler('testset_generator.log'),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=_LOG_HANDLERS,
)
# Module-level logger shared by every class and function in this script.
logger = logging.getLogger(__name__)


class TestsetGeneratorConfig:
    """Configuration for testset generation, read from environment variables.

    ``load_dotenv()`` is called before the variables are read.

    Attributes:
        api_key: Value of ``LITELLM_API_KEY`` (required, default ``""``).
        api_base: Value of ``LITELLM_API`` (required, default ``""``).
        model_name: Value of ``MODEL_NAME`` (required, default ``""``).
        embedding_model: Value of ``EMBEDDING_MODEL`` (required, default ``""``).
        documents_dir: Value of ``DOCUMENTS_DIR``; defaults to ``./abc``.
        output_dir: Value of ``OUTPUT_DIR``; defaults to ``./output``.
        testset_size: Value of ``TESTSET_SIZE`` as an int; defaults to 10.
    """

    # Used when TESTSET_SIZE is unset or blank.
    _DEFAULT_TESTSET_SIZE = 10

    def __init__(self):
        """Read all settings from the environment.

        Raises:
            ValueError: If ``TESTSET_SIZE`` is set to a non-integer value.
        """
        load_dotenv()
        self.api_key = os.getenv("LITELLM_API_KEY", "")
        self.api_base = os.getenv("LITELLM_API", "")
        self.model_name = os.getenv("MODEL_NAME", "")
        self.embedding_model = os.getenv("EMBEDDING_MODEL", "")
        self.documents_dir = os.getenv("DOCUMENTS_DIR", "./abc")
        self.output_dir = os.getenv("OUTPUT_DIR", "./output")
        self.testset_size = self._parse_testset_size(os.getenv("TESTSET_SIZE"))

    @classmethod
    def _parse_testset_size(cls, raw: Optional[str]) -> int:
        """Convert the raw ``TESTSET_SIZE`` value to an int.

        Args:
            raw: The environment value, or None when the variable is unset.

        Returns:
            The parsed integer, or the default (10) when ``raw`` is None or
            contains only whitespace (e.g. ``TESTSET_SIZE=`` in a .env file).

        Raises:
            ValueError: If ``raw`` is non-blank but not a valid integer.
        """
        if raw is None or not raw.strip():
            return cls._DEFAULT_TESTSET_SIZE
        try:
            return int(raw)
        except ValueError as err:
            # Same exception type as a bare int() call, but names the variable.
            raise ValueError(
                f"TESTSET_SIZE must be an integer, got {raw!r}"
            ) from err

    def validate(self) -> bool:
        """Check that the required settings are present and usable.

        Returns:
            True when every required variable is non-empty and
            ``testset_size`` is positive; False otherwise (the reason is
            logged as an error).
        """
        required_fields = [
            ("LITELLM_API_KEY", self.api_key),
            ("LITELLM_API", self.api_base),
            ("MODEL_NAME", self.model_name),
            ("EMBEDDING_MODEL", self.embedding_model),
        ]

        missing_fields = [name for name, value in required_fields if not value]
        if missing_fields:
            logger.error(f"Missing required environment variables: {missing_fields}")
            return False

        if self.testset_size <= 0:
            logger.error(
                f"TESTSET_SIZE must be a positive integer, got {self.testset_size}"
            )
            return False

        return True


class DocumentLoader:
    """Handles document loading operations."""

    @staticmethod
    def load_documents(directory_path: str) -> Optional[List[Document]]:
        """
        Load documents from the specified directory.

        The path is checked up front (exists, is a directory, is not empty)
        so each failure mode gets its own log message before the reader runs.

        Args:
            directory_path: Path to the directory containing documents

        Returns:
            List of loaded documents, or None if the path is unusable, the
            reader returns nothing, or any error occurs (the error is logged).
        """
        try:
            source_dir = Path(directory_path)

            if not source_dir.exists():
                logger.error(f"Directory does not exist: {directory_path}")
                return None

            # A regular file would otherwise fail later with a generic error.
            if not source_dir.is_dir():
                logger.error(f"Path is not a directory: {directory_path}")
                return None

            # any() stops at the first entry instead of listing everything.
            if not any(source_dir.iterdir()):
                logger.error(f"Directory is empty: {directory_path}")
                return None

            documents = SimpleDirectoryReader(directory_path).load_data()

            if not documents:
                logger.error(f"No valid documents found in directory: {directory_path}")
                return None

            logger.info(f"Successfully loaded {len(documents)} documents from {directory_path}")
            return documents

        except Exception as e:
            # Boundary handler: callers rely on None rather than an exception.
            # logger.exception keeps the traceback for diagnosis.
            logger.exception(f"Error loading documents from {directory_path}: {e}")
            return None


class TestsetGeneratorService:
    """Service class for generating testsets and saving them as CSV.

    Typical use: construct with a config, call ``initialize_models()``, then
    ``generate_testset(documents)`` and ``save_testset(df)``. Every method
    reports failure through its return value and logs the cause.
    """

    # Generator output column -> column name used in the returned DataFrame.
    # Insertion order is also the column order of the result.
    _COLUMN_MAPPING = {
        'user_input': 'input',
        'reference_contexts': 'context',
        'reference': 'reference_output',
    }

    def __init__(self, config: "TestsetGeneratorConfig"):
        """Store the config; models are created later by initialize_models()."""
        self.config = config
        self.llm = None
        self.embeddings = None
        self.generator = None

    def initialize_models(self) -> bool:
        """Initialize the LLM, the embedding model and the testset generator.

        Returns:
            True if all three objects were created, False otherwise.
        """
        try:
            self.llm = LiteLLM(
                model=self.config.model_name,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )

            self.embeddings = LiteLLMEmbedding(
                api_key=self.config.api_key,
                api_base=self.config.api_base,
                model_name=self.config.embedding_model,
            )

            self.generator = TestsetGenerator.from_llama_index(
                llm=self.llm,
                embedding_model=self.embeddings,
            )

            logger.info("Successfully initialized models")
            return True

        except Exception as e:
            logger.exception(f"Error initializing models: {e}")
            return False

    def generate_testset(self, documents: "List[Document]") -> Optional[pd.DataFrame]:
        """
        Generate testset from documents.

        Cell values are passed through unchanged; only columns are renamed
        and filtered. The number of entries in each ``context`` list is
        therefore whatever the generator produced.

        Args:
            documents: List of documents to generate testset from

        Returns:
            DataFrame with the columns ``input``, ``context`` and
            ``reference_output`` (those that are available), or None if there
            are no documents, the generator is not initialized, none of the
            expected columns exist, or an error occurs.
        """
        if not documents:
            logger.error("No documents provided for testset generation")
            return None

        # Without this guard the failure surfaces as an opaque AttributeError
        # on None inside the except branch below.
        if self.generator is None:
            logger.error(
                "Testset generator is not initialized; call initialize_models() first"
            )
            return None

        try:
            logger.info(f"Generating testset with size {self.config.testset_size}")
            testset = self.generator.generate_with_llamaindex_docs(
                documents,
                testset_size=self.config.testset_size,
            )

            df = self._normalize_columns(testset.to_pandas())
            if df is None:
                return None

            logger.info(f"Successfully generated testset with {len(df)} entries")
            return df

        except Exception as e:
            logger.exception(f"Error generating testset: {e}")
            return None

    @classmethod
    def _normalize_columns(cls, df: pd.DataFrame) -> Optional[pd.DataFrame]:
        """Rename generator columns and keep only the target columns.

        Args:
            df: DataFrame as returned by the generated testset's ``to_pandas()``.

        Returns:
            A DataFrame restricted to the renamed target columns that exist,
            or None when none of them exist. Missing source columns are
            logged as a warning.
        """
        missing_columns = [col for col in cls._COLUMN_MAPPING if col not in df.columns]
        if missing_columns:
            logger.warning(f"Missing columns in testset: {missing_columns}")

        renamed = df.rename(columns=cls._COLUMN_MAPPING)

        available_columns = [
            col for col in cls._COLUMN_MAPPING.values() if col in renamed.columns
        ]
        if not available_columns:
            logger.error("No target columns found in the testset")
            return None

        return renamed[available_columns]

    def save_testset(self, df: pd.DataFrame, filename: str = "testset_output.csv") -> bool:
        """
        Save testset to CSV file inside the configured output directory.

        Args:
            df: DataFrame to save
            filename: Output filename

        Returns:
            True if successful, False otherwise
        """
        try:
            # Create output directory if it doesn't exist
            output_path = Path(self.config.output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

            file_path = output_path / filename
            df.to_csv(file_path, index=False)

            logger.info(f"Successfully saved testset to {file_path}")
            return True

        except Exception as e:
            logger.exception(f"Error saving testset: {e}")
            return False


def main():
    """Run the pipeline: configuration, documents, models, testset, CSV.

    Each stage that fails logs an error and ends the process with exit
    status 1; a full run logs start and completion messages.
    """
    def abort(reason):
        """Log *reason* as an error and exit with status 1."""
        logger.error(reason)
        sys.exit(1)

    logger.info("Starting testset generation process")

    # Stage 1: configuration
    config = TestsetGeneratorConfig()
    if not config.validate():
        abort("Configuration validation failed")

    # Stage 2: source documents
    documents = DocumentLoader.load_documents(config.documents_dir)
    if documents is None:
        abort("Failed to load documents")

    # Stage 3: LLM, embeddings and generator
    service = TestsetGeneratorService(config)
    if not service.initialize_models():
        abort("Failed to initialize models")

    # Stage 4: generation
    testset_df = service.generate_testset(documents)
    if testset_df is None:
        abort("Failed to generate testset")

    # Stage 5: persistence
    if not service.save_testset(testset_df):
        abort("Failed to save testset")

    logger.info("Testset generation completed successfully")


if __name__ == "__main__":
    main()

**Error trace**

**Expected behavior**
The "contexts" list should contain 3 contexts, or the number of contexts should at least be configurable. It always contains only one value, which is strange.


Metadata

Metadata

Assignees

No one assigned

    Labels

    bug (Something isn't working), module-testsetgen (Module testset generation)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions