Only getting one entry in "contexts" list while generating synthetic data using TestsetGenerator

**Describe the bug**
When generating synthetic test data with `TestsetGenerator`, every generated sample contains only one entry in its "contexts" list.

Ragas version:
ragas==0.1.22

**Code to Reproduce**
```python
import os
import logging
from pathlib import Path
from typing import Optional, List
import sys

from llama_index.core import SimpleDirectoryReader, Document
from ragas.testset import TestsetGenerator
from llama_index.llms.litellm import LiteLLM
from llama_index.embeddings.litellm import LiteLLMEmbedding
from dotenv import load_dotenv
import pandas as pd


# Logging setup: the root logger is configured at INFO level with two
# handlers, one writing to a log file and one writing to stdout.
_log_file_handler = logging.FileHandler('testset_generator.log')
_stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    handlers=[_log_file_handler, _stdout_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by every class and function below.
logger = logging.getLogger(__name__)


class TestsetGeneratorConfig:
    """Settings for testset generation, read from environment variables."""

    def __init__(self):
        # load_dotenv() runs first so the lookups below see whatever it loads.
        load_dotenv()
        env = os.getenv

        # Required settings: default to "" so validate() can flag them.
        self.api_key = env("LITELLM_API_KEY", "")
        self.api_base = env("LITELLM_API", "")
        self.model_name = env("MODEL_NAME", "")
        self.embedding_model = env("EMBEDDING_MODEL", "")

        # Optional settings with fallbacks.
        self.documents_dir = env("DOCUMENTS_DIR", "./abc")
        self.output_dir = env("OUTPUT_DIR", "./output")
        # int() raises ValueError if TESTSET_SIZE is set to a non-integer.
        self.testset_size = int(env("TESTSET_SIZE", "10"))

    def validate(self) -> bool:
        """Report whether every required setting has a non-empty value.

        Returns:
            True when all required settings are present. Otherwise the names
            of the missing environment variables are logged as an error and
            False is returned.
        """
        required = {
            "LITELLM_API_KEY": self.api_key,
            "LITELLM_API": self.api_base,
            "MODEL_NAME": self.model_name,
            "EMBEDDING_MODEL": self.embedding_model,
        }
        missing_fields = [name for name, value in required.items() if not value]

        if not missing_fields:
            return True

        logger.error(f"Missing required environment variables: {missing_fields}")
        return False


class DocumentLoader:
    """Reads documents from a directory on disk."""

    @staticmethod
    def load_documents(directory_path: str) -> Optional[List[Document]]:
        """
        Read every document found under ``directory_path``.

        Args:
            directory_path: Path to the directory containing documents

        Returns:
            The loaded documents, or None when the path does not exist, the
            directory is empty, the reader yields nothing, or any exception
            is raised along the way (each case is logged as an error).
        """
        try:
            # Guard 1: the path has to exist.
            if not os.path.exists(directory_path):
                logger.error(f"Directory does not exist: {directory_path}")
                return None

            # Guard 2: the directory has to contain at least one entry.
            entries = os.listdir(directory_path)
            if not entries:
                logger.error(f"Directory is empty: {directory_path}")
                return None

            loaded = SimpleDirectoryReader(directory_path).load_data()
            if loaded:
                logger.info(f"Successfully loaded {len(loaded)} documents from {directory_path}")
                return loaded

            # The reader ran but produced nothing.
            logger.error(f"No valid documents found in directory: {directory_path}")
            return None

        except Exception as exc:
            logger.error(f"Error loading documents from {directory_path}: {str(exc)}")
            return None


class TestsetGeneratorService:
    """Builds the models, generates a testset and writes it to disk."""

    def __init__(self, config: TestsetGeneratorConfig):
        self.config = config
        # The three attributes below stay None until initialize_models() runs.
        self.llm = None
        self.embeddings = None
        self.generator = None

    def initialize_models(self) -> bool:
        """Create the LLM, the embedding model and the testset generator.

        Returns:
            True when all three objects were created, False (after logging
            the error) if any step raised.
        """
        cfg = self.config
        try:
            self.llm = LiteLLM(
                model=cfg.model_name,
                api_key=cfg.api_key,
                api_base=cfg.api_base,
            )
            self.embeddings = LiteLLMEmbedding(
                api_key=cfg.api_key,
                api_base=cfg.api_base,
                model_name=cfg.embedding_model,
            )
            self.generator = TestsetGenerator.from_llama_index(
                llm=self.llm,
                embedding_model=self.embeddings,
            )
            logger.info("Successfully initialized models")
            return True
        except Exception as exc:
            logger.error(f"Error initializing models: {str(exc)}")
            return False

    def generate_testset(self, documents: List[Document]) -> Optional[pd.DataFrame]:
        """
        Produce a testset DataFrame from the given documents.

        Args:
            documents: List of documents to generate testset from

        Returns:
            A DataFrame restricted to whichever of the columns 'input',
            'context' and 'reference_output' are available, or None when no
            documents were given, none of those columns exist, or an
            exception occurred.
        """
        try:
            if not documents:
                logger.error("No documents provided for testset generation")
                return None

            size = self.config.testset_size
            logger.info(f"Generating testset with size {size}")
            generated = self.generator.generate_with_llamaindex_docs(
                documents,
                testset_size=size,
            )
            frame = generated.to_pandas()

            # Generator column name -> output column name.
            renames = {
                'user_input': 'input',
                'reference_contexts': 'context',
                'reference': 'reference_output',
            }

            # Warn about expected source columns that are absent.
            absent = [source for source in renames if source not in frame.columns]
            if absent:
                logger.warning(f"Missing columns in testset: {absent}")

            # rename() leaves the frame alone for keys that do not match.
            frame = frame.rename(columns=renames)

            # Keep the output columns that are present, in mapping order.
            kept = [target for target in renames.values() if target in frame.columns]
            if not kept:
                logger.error("No target columns found in the testset")
                return None

            frame = frame[kept]
            logger.info(f"Successfully generated testset with {len(frame)} entries")
            return frame

        except Exception as exc:
            logger.error(f"Error generating testset: {str(exc)}")
            return None

    def save_testset(self, df: pd.DataFrame, filename: str = "testset_output.csv") -> bool:
        """
        Write the testset to a CSV file inside the configured output directory.

        Args:
            df: DataFrame to save
            filename: Output filename

        Returns:
            True if the file was written, False (after logging the error)
            otherwise
        """
        try:
            # Ensure the destination directory exists, creating parents too.
            destination_dir = Path(self.config.output_dir)
            destination_dir.mkdir(parents=True, exist_ok=True)

            target_file = destination_dir / filename
            df.to_csv(target_file, index=False)

            logger.info(f"Successfully saved testset to {target_file}")
            return True
        except Exception as exc:
            logger.error(f"Error saving testset: {str(exc)}")
            return False


def main():
    """Run the pipeline end to end: configure, load, initialise, generate, save.

    Any failing stage logs an error and terminates the process with exit
    status 1.
    """

    def abort(reason: str) -> None:
        # Every stage fails the same way: log the reason, then exit with 1.
        logger.error(reason)
        sys.exit(1)

    logger.info("Starting testset generation process")

    # Stage 1: configuration
    config = TestsetGeneratorConfig()
    if not config.validate():
        abort("Configuration validation failed")

    # Stage 2: input documents
    documents = DocumentLoader.load_documents(config.documents_dir)
    if documents is None:
        abort("Failed to load documents")

    # Stage 3: models and generator
    service = TestsetGeneratorService(config)
    if not service.initialize_models():
        abort("Failed to initialize models")

    # Stage 4: testset generation
    testset_df = service.generate_testset(documents)
    if testset_df is None:
        abort("Failed to generate testset")

    # Stage 5: persist the result
    if not service.save_testset(testset_df):
        abort("Failed to save testset")

    logger.info("Testset generation completed successfully")

if __name__ == "__main__":
    main()
```

**Error trace**

**Expected behavior**
Each generated sample should contain 3 contexts, or the number of contexts should at least be configurable. Currently the "contexts" list always contains a single value, which is strange.




Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Only getting one entry in "contexts" list while generating synthetic data using TestsetGenerator #2145

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Only getting one entry in "contexts" list while generating synthetic data using TestsetGenerator #2145

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions