Added comments to understand code

hmumtazz · hmumtazz · commit b6ad90b1d232 · 2024-11-21T00:21:29.000-08:00
Signed-off-by: hmumtazz <hashimkmumtaz@gmail.com>
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/config.ini b/opensearch_py_ml/ml_commons/rag_pipeline/rag/config.ini
@@ -0,0 +1,11 @@
+[DEFAULT]
+region = us-west-2
+iam_principal = arn:aws:iam::615299771255:user/hmumtazz
+index_name = drpepper
+collection_name = 
+is_serverless = False
+opensearch_endpoint = https://search-hashim-test5-eivrlyacr3n653fnkkrg2yab7u.aos.us-west-2.on.aws
+opensearch_username = admin
+opensearch_password = MyPassword123!
+embedding_dimension = 768
+space_type = l2
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/ingest.py b/opensearch_py_ml/ml_commons/rag_pipeline/rag/ingest.py
@@ -46,13 +46,16 @@ class Ingest:
     EMBEDDING_MODEL_ID = 'amazon.titan-embed-text-v2:0'
 
     def __init__(self, config):
+        # Initialize the Ingest class with configuration
         self.config = config
         self.aws_region = config.get('region')
         self.index_name = config.get('index_name')
         self.bedrock_client = None
         self.opensearch = OpenSearchConnector(config)
 
     def initialize_clients(self):
+        # Initialize AWS Bedrock and OpenSearch clients
+        # Returns True if successful, False otherwise
         try:
             self.bedrock_client = boto3.client('bedrock-runtime', region_name=self.aws_region)
             if self.opensearch.initialize_opensearch_client():
@@ -66,6 +69,9 @@ def initialize_clients(self):
             return False
 
     def process_file(self, file_path: str) -> List[Dict[str, str]]:
+        # Process a file based on its extension
+        # Supports CSV, TXT, and PDF files
+        # Returns a list of dictionaries containing extracted text
         _, file_extension = os.path.splitext(file_path)
         
         if file_extension.lower() == '.csv':
@@ -79,6 +85,9 @@ def process_file(self, file_path: str) -> List[Dict[str, str]]:
             return []
 
     def process_csv(self, file_path: str) -> List[Dict[str, str]]:
+        # Process a CSV file
+        # Extracts information and formats it into a sentence
+        # Returns a list of dictionaries with the formatted text
         documents = []
         with open(file_path, 'r') as csvfile:
             reader = csv.DictReader(csvfile)
@@ -90,11 +99,17 @@ def process_csv(self, file_path: str) -> List[Dict[str, str]]:
         return documents
 
     def process_txt(self, file_path: str) -> List[Dict[str, str]]:
+        # Process a TXT file
+        # Reads the entire content of the file
+        # Returns a list with a single dictionary containing the file content
         with open(file_path, 'r') as txtfile:
             content = txtfile.read()
         return [{"text": content}]
 
     def process_pdf(self, file_path: str) -> List[Dict[str, str]]:
+        # Process a PDF file
+        # Extracts text from each page of the PDF
+        # Returns a list of dictionaries, each containing text from a page
         documents = []
         with open(file_path, 'rb') as pdffile:
             pdf_reader = PyPDF2.PdfReader(pdffile)
@@ -105,6 +120,9 @@ def process_pdf(self, file_path: str) -> List[Dict[str, str]]:
         return documents
 
     def text_embedding(self, text, max_retries=5, initial_delay=1, backoff_factor=2):
+        # Generate text embedding using AWS Bedrock
+        # Implements exponential backoff for retries in case of failures
+        # Returns the embedding if successful, None otherwise
         if self.bedrock_client is None:
             print("Bedrock client is not initialized. Please run setup first.")
             return None
@@ -139,6 +157,9 @@ def text_embedding(self, text, max_retries=5, initial_delay=1, backoff_factor=2)
         return None
 
     def process_and_ingest_data(self, file_paths: List[str]):
+        # Process and ingest data from multiple files
+        # Generates embeddings for each document and ingests into OpenSearch
+        # Displays progress and results of the ingestion process
         if not self.initialize_clients():
             print("Failed to initialize clients. Aborting ingestion.")
             return
@@ -197,6 +218,8 @@ def process_and_ingest_data(self, file_paths: List[str]):
         print(f"{Fore.RED}Failed to ingest {failed} documents.{Style.RESET_ALL}")
 
     def ingest_command(self, paths: List[str]):
+        # Main ingestion command
+        # Processes all valid files in the given paths and initiates ingestion
         all_files = []
         for path in paths:
             if os.path.isfile(path):
@@ -215,4 +238,4 @@ def ingest_command(self, paths: List[str]):
         
         print(f"{Fore.GREEN}Found {len(valid_files)} valid files for ingestion.{Style.RESET_ALL}")
         
-        self.process_and_ingest_data(valid_files)
+        self.process_and_ingest_data(valid_files)
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/opensearch_connector.py b/opensearch_py_ml/ml_commons/rag_pipeline/rag/opensearch_connector.py
@@ -30,6 +30,7 @@
 
 class OpenSearchConnector:
     def __init__(self, config):
+        # Initialize the OpenSearchConnector with configuration
         self.config = config
         self.opensearch_client = None
         self.aws_region = config.get('region')
@@ -40,6 +41,9 @@ def __init__(self, config):
         self.opensearch_password = config.get('opensearch_password')
 
     def initialize_opensearch_client(self):
+        # Initialize the OpenSearch client
+        # Handles both serverless and non-serverless configurations
+        # Returns True if successful, False otherwise
         if not self.opensearch_endpoint:
             print("OpenSearch endpoint not set. Please run setup first.")
             return False
@@ -73,6 +77,8 @@ def initialize_opensearch_client(self):
             return False
 
     def create_index(self, embedding_dimension, space_type):
+        # Create a new KNN index in OpenSearch
+        # Sets up the mapping for nominee_text and nominee_vector fields
         index_body = {
             "mappings": {
                 "properties": {
@@ -107,6 +113,8 @@ def create_index(self, embedding_dimension, space_type):
                 print(f"Error creating index '{self.index_name}': {e}")
 
     def verify_and_create_index(self, embedding_dimension, space_type):
+        # Check if the index exists, create it if it doesn't
+        # Returns True if the index exists or was successfully created, False otherwise
         try:
             index_exists = self.opensearch_client.indices.exists(index=self.index_name)
             if index_exists:
@@ -119,6 +127,8 @@ def verify_and_create_index(self, embedding_dimension, space_type):
             return False
 
     def bulk_index(self, actions):
+        # Perform bulk indexing of documents
+        # Returns the number of successfully indexed documents and the number of failures
         try:
             success_count, error_info = opensearch_helpers.bulk(self.opensearch_client, actions)
             error_count = len(error_info)
@@ -129,6 +139,8 @@ def bulk_index(self, actions):
             return 0, len(actions)
 
     def search(self, vector, k=5):
+        # Perform a KNN search using the provided vector
+        # Returns the top k matching documents
         try:
             response = self.opensearch_client.search(
                 index=self.index_name,
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/query.py b/opensearch_py_ml/ml_commons/rag_pipeline/rag/query.py
@@ -40,13 +40,16 @@ class Query:
     LLM_MODEL_ID = 'amazon.titan-text-express-v1'
 
     def __init__(self, config):
+        # Initialize the Query class with configuration
         self.config = config
         self.aws_region = config.get('region')
         self.index_name = config.get('index_name')
         self.bedrock_client = None
         self.opensearch = OpenSearchConnector(config)
 
     def initialize_clients(self):
+        # Initialize AWS Bedrock and OpenSearch clients
+        # Returns True if successful, False otherwise
         try:
             self.bedrock_client = boto3.client('bedrock-runtime', region_name=self.aws_region)
             if self.opensearch.initialize_opensearch_client():
@@ -60,6 +63,9 @@ def initialize_clients(self):
             return False
 
     def text_embedding(self, text, max_retries=5, initial_delay=1, backoff_factor=2):
+        # Generate text embedding using AWS Bedrock
+        # Implements exponential backoff for retries in case of failures
+        # Returns the embedding if successful, None otherwise
         if self.bedrock_client is None:
             print("Bedrock client is not initialized. Please run setup first.")
             return None
@@ -94,6 +100,9 @@ def text_embedding(self, text, max_retries=5, initial_delay=1, backoff_factor=2)
         return None
 
     def bulk_query(self, queries, k=5):
+        # Perform bulk semantic search for multiple queries
+        # Generates embeddings for queries and searches OpenSearch index
+        # Returns a list of results containing query, context, and number of results
         print("Generating embeddings for queries...")
         query_vectors = []
         for query in queries:
@@ -133,6 +142,9 @@ def bulk_query(self, queries, k=5):
         return results
 
     def generate_answer(self, prompt, config):
+        # Generate an answer using the LLM model
+        # Handles token limit and configures LLM parameters
+        # Returns the generated answer or None if an error occurs
         try:
             max_input_tokens = 8192  # Max tokens for the model
             expected_output_tokens = config.get('maxTokenCount', 1000)
@@ -172,6 +184,9 @@ def generate_answer(self, prompt, config):
             return None
 
     def query_command(self, queries: List[str], num_results=5):
+        # Main query command to process multiple queries
+        # Performs semantic search and generates answers using LLM
+        # Prints results for each query
         if not self.initialize_clients():
             print("Failed to initialize clients. Aborting query.")
             return
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/rag.py b/opensearch_py_ml/ml_commons/rag_pipeline/rag/rag.py
@@ -38,17 +38,20 @@
 CONFIG_FILE = 'config.ini'
 
 def load_config():
+    # Load configuration from the config file
     config = configparser.ConfigParser()
     config.read(CONFIG_FILE)
     return config['DEFAULT']
 
 def save_config(config):
+    # Save configuration to the config file
     parser = configparser.ConfigParser()
     parser['DEFAULT'] = config
     with open(CONFIG_FILE, 'w') as f:
         parser.write(f)
     
 def main():
+    # Set up argument parser for CLI
     parser = argparse.ArgumentParser(description="RAG Pipeline CLI")
     parser.add_argument('command', choices=['setup', 'ingest', 'query'], help='Command to run')
     parser.add_argument('--paths', nargs='+', help='Paths to files or directories for ingestion')
@@ -57,14 +60,18 @@ def main():
 
     args = parser.parse_args()
 
+    # Load existing configuration
     config = load_config()
 
     if args.command == 'setup':
+        # Run setup process
         setup = Setup()
         setup.setup_command()
         save_config(setup.config)
     elif args.command == 'ingest':
+        # Handle ingestion command
         if not args.paths:
+            # If no paths provided as arguments, prompt user for input
             paths = []
             while True:
                 path = input("Enter a file or directory path (or press Enter to finish): ")
@@ -76,7 +83,9 @@ def main():
         ingest = Ingest(config)
         ingest.ingest_command(paths)
     elif args.command == 'query':
+        # Handle query command
         if not args.queries:
+            # If no queries provided as arguments, prompt user for input
             queries = []
             while True:
                 query = input("Enter a query (or press Enter to finish): ")
@@ -88,7 +97,8 @@ def main():
         query = Query(config)
         query.query_command(queries, num_results=args.num_results)
     else:
+        # If an invalid command is provided, print help
         parser.print_help()
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/rag_setup.py b/opensearch_py_ml/ml_commons/rag_pipeline/rag/rag_setup.py
diff --git a/opensearch_py_ml/ml_commons/rag_pipeline/rag/setup.py b/opensearch_py_ml/ml_commons/rag_pipeline/rag/setup.py