From 6c37c7107e8ec53c5cba0f822649947e791178c0 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Wed, 13 Nov 2024 03:20:02 +0530
Subject: [PATCH 01/12] batching + gpt-4o-mini

---
 app.py | 158 ++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 129 insertions(+), 29 deletions(-)

diff --git a/app.py b/app.py
index fb2c885..fb261a9 100644
--- a/app.py
+++ b/app.py
@@ -12,6 +12,8 @@
 import json
 from dotenv import load_dotenv
 from redis import ConnectionPool
+import time
+from concurrent.futures import ThreadPoolExecutor
 
 load_dotenv()
 
@@ -35,14 +37,29 @@
 
 # Logging setup
 def setup_logging(config):
-    logging.basicConfig(
-        filename=config['LOG_FILE'],
-        level=logging.INFO,
-        format=config['LOG_FORMAT'],
+    # Create a formatter
+    formatter = logging.Formatter(
+        config['LOG_FORMAT'],
         datefmt=config['LOG_DATE_FORMAT']
     )
-    # Return a logger instance
-    return logging.getLogger(__name__)
+
+    # Setup file handler
+    file_handler = logging.FileHandler(config['LOG_FILE'])
+    file_handler.setFormatter(formatter)
+
+    # Setup console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setFormatter(formatter)
+
+    # Get the logger
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+
+    # Add both handlers
+    logger.addHandler(file_handler)
+    logger.addHandler(console_handler)
+
+    return logger
 
 # Database setup
 def setup_database(codebase_path):
@@ -129,8 +146,10 @@ def openai_hyde_v2(query, temp_context, hyde_query):
 
 
 def openai_chat(query, context):
+    start_time = time.time()
+
     chat_completion = client.chat.completions.create(
-        model="gpt-4o",
+        model="gpt-4o-mini",
         messages=[
             {
                 "role": "system",
@@ -142,6 +161,10 @@
             }
         ]
     )
+
+    chat_time = time.time() - start_time
+    app.logger.info(f"Chat response took: {chat_time:.2f} seconds")
+
     return chat_completion.choices[0].message.content
 
 def process_input(input_text):
@@ -152,47 +175,98 @@ def process_input(input_text):
     return processed_text
 
 def generate_context(query, rerank=False):
+    start_time = time.time()
+
+    # First HYDE call
     hyde_query = openai_hyde(query)
+    hyde_time = time.time()
+    app.logger.info(f"First HYDE call took: {hyde_time - start_time:.2f} seconds")
+
+    # Concurrent execution of first database searches
+    def search_method_table():
+        return method_table.search(hyde_query).limit(5).to_pandas()
+
+    def search_class_table():
+        return class_table.search(hyde_query).limit(5).to_pandas()
 
-    method_docs = method_table.search(hyde_query).limit(5).to_pandas()
-    class_docs = class_table.search(hyde_query).limit(5).to_pandas()
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_method_docs = executor.submit(search_method_table)
+        future_class_docs = executor.submit(search_class_table)
+        method_docs = future_method_docs.result()
+        class_docs = future_class_docs.result()
 
-    temp_context = '\n'.join(method_docs['code'] + '\n'.join(class_docs['source_code']) )
+    first_search_time = time.time()
+    app.logger.info(f"First DB search took: {first_search_time - hyde_time:.2f} seconds")
 
+    temp_context = '\n'.join(method_docs['code'].tolist() + class_docs['source_code'].tolist())
+
+    # Second HYDE call
     hyde_query_v2 = openai_hyde_v2(query, temp_context, hyde_query)
+    second_hyde_time = time.time()
+    app.logger.info(f"Second HYDE call took: {second_hyde_time - first_search_time:.2f} seconds")
+
+    # Concurrent execution of second database searches
+    def search_method_table_v2():
+        return method_table.search(hyde_query_v2)
+
+    def search_class_table_v2():
+        return class_table.search(hyde_query_v2)
 
-    logging.info("-query_v2-")
-    logging.info(hyde_query_v2)
+    with ThreadPoolExecutor(max_workers=2) as executor:
+        future_method_search = executor.submit(search_method_table_v2)
+        future_class_search = executor.submit(search_class_table_v2)
+        method_search = future_method_search.result()
+        class_search = future_class_search.result()
 
-    method_search = method_table.search(hyde_query_v2)
-    class_search = class_table.search(hyde_query_v2)
+    search_time = time.time()
+    app.logger.info(f"Second DB search took: {search_time - second_hyde_time:.2f} seconds")
 
+    # Concurrent reranking if enabled
+    app.logger.info(f"Reranking enabled: {rerank}")
     if rerank:
-        method_search = method_search.rerank(reranker)
-        class_search = class_search.rerank(reranker)
+        rerank_start_time = time.time()  # Start timing before reranking
+
+        def rerank_method_search():
+            return method_search.rerank(reranker)
+
+        def rerank_class_search():
+            return class_search.rerank(reranker)
+
+        with ThreadPoolExecutor(max_workers=2) as executor:
+            future_method_search = executor.submit(rerank_method_search)
+            future_class_search = executor.submit(rerank_class_search)
+            method_search = future_method_search.result()
+            class_search = future_class_search.result()
+
+        rerank_time = time.time()
+        app.logger.info(f"Reranking took: {rerank_time - rerank_start_time:.2f} seconds")
+
+    # Set final time reference point
+    rerank_time = time.time() if rerank else search_time
 
+    # Fetch top documents
     method_docs = method_search.limit(5).to_list()
     class_docs = class_search.limit(5).to_list()
 
+    final_search_time = time.time()
+    app.logger.info(f"Final DB search took: {final_search_time - rerank_time:.2f} seconds")
+
+    # Combine documents
     top_3_methods = method_docs[:3]
-    methods_combined = "\n\n".join(f"File: {doc['file_path']}\nCode:\n{doc['code']}" for doc in top_3_methods)
+    methods_combined = "\n\n".join(
+        f"File: {doc['file_path']}\nCode:\n{doc['code']}" for doc in top_3_methods
+    )
 
     top_3_classes = class_docs[:3]
-    classes_combined = "\n\n".join(f"File: {doc['file_path']}\nClass Info:\n{doc['source_code']} References: \n{doc['references']} \n END OF ROW {i}" for i, doc in enumerate(top_3_classes))
-
-    app.logger.info("Classes Combined:")
-    app.logger.info("-" * 40)
-    app.logger.info(classes_combined)
-    app.logger.info(f"Length of classes_combined: {len(classes_combined)}")
-    app.logger.info("-" * 40)
-
-    app.logger.info("Methods Combined:")
-    app.logger.info("-" * 40)
-    app.logger.info(methods_combined)
-    app.logger.info("-" * 40)
+    classes_combined = "\n\n".join(
+        f"File: {doc['file_path']}\nClass Info:\n{doc['source_code']} References: \n{doc['references']} \n END OF ROW {i}"
+        for i, doc in enumerate(top_3_classes)
+    )
 
     app.logger.info("Context generation complete.")
 
+    total_time = time.time() - start_time
+    app.logger.info(f"Total context generation took: {total_time:.2f} seconds")
+
     return methods_combined + "\n below is class or constructor related code \n" + classes_combined
 
 @app.route('/', methods=['GET', 'POST'])
@@ -257,4 +331,30 @@ def home():
     # Setup database
     method_table, class_table = setup_database(codebase_path)
 
+    app.logger.info("Server starting up...")  # Test log message
     app.run(host='0.0.0.0', port=5001)
+
+
+# 127.0.0.1 - - [13/Nov/2024 02:45:06] "GET / HTTP/1.1" 200 -
+# 13-Nov-24 02:45:21 - First HYDE call took: 3.05 seconds
+# 13-Nov-24 02:45:23 - First DB search took: 2.36 seconds
+# 13-Nov-24 02:45:34 - Second HYDE call took: 10.82 seconds
+# 13-Nov-24 02:45:36 - Reranking took: 2.44 seconds
+# 13-Nov-24 02:45:37 - Second DB search took: 0.65 seconds
+# 13-Nov-24 02:45:37 - Context generation complete.
+# 13-Nov-24 02:45:37 - Total context generation took: 19.32 seconds
+# 13-Nov-24 02:45:37 - Generated context for query with @codebase.
+# 13-Nov-24 02:46:00 - Chat response took: 23.01 seconds
+
+
+# 127.0.0.1 - - [13/Nov/2024 03:01:37] "POST / HTTP/1.1" 200 -
+# 13-Nov-24 03:01:54 - First HYDE call took: 3.18 seconds
+# 13-Nov-24 03:01:55 - First DB search took: 1.28 seconds
+# 13-Nov-24 03:02:02 - Second HYDE call took: 6.87 seconds
+# 13-Nov-24 03:02:03 - Second DB search took: 0.85 seconds
+# 13-Nov-24 03:02:03 - Reranking took: 0.00 seconds
+# 13-Nov-24 03:02:04 - Final DB search took: 0.68 seconds
+# 13-Nov-24 03:02:04 - Context generation complete.
+# 13-Nov-24 03:02:04 - Total context generation took: 12.86 seconds
+# 13-Nov-24 03:02:04 - Generated context for query with @codebase.
+# 13-Nov-24 03:02:26 - Chat response took: 22.19 seconds
\ No newline at end of file
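A note on the batching pattern patch 01 introduces: the two LanceDB table searches are independent, I/O-bound calls, so submitting them to a two-worker thread pool roughly halves retrieval latency, which is what the "First DB search" deltas in the logs above reflect. The same shape, factored into a reusable helper (illustrative only; the patch inlines the pattern instead of using a helper like this):

```python
from concurrent.futures import ThreadPoolExecutor

def run_pair(fn_a, fn_b):
    """Run two blocking callables concurrently and return both results.

    Hypothetical helper, not part of the patch. It works because the
    LanceDB searches spend most of their time waiting on I/O, so two
    threads overlap cleanly despite the GIL.
    """
    with ThreadPoolExecutor(max_workers=2) as executor:
        future_a = executor.submit(fn_a)
        future_b = executor.submit(fn_b)
        # .result() re-raises any exception from the worker thread
        return future_a.result(), future_b.result()

# Usage, mirroring the first search in generate_context:
# method_docs, class_docs = run_pair(
#     lambda: method_table.search(hyde_query).limit(5).to_pandas(),
#     lambda: class_table.search(hyde_query).limit(5).to_pandas(),
# )
```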
From d63b35af3517055e387bc53fb7071ec41fdf83aa Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Wed, 13 Nov 2024 03:40:41 +0530
Subject: [PATCH 02/12] sambanova gives a 2x speedup

---
 app.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/app.py b/app.py
index fb261a9..7f5e7f5 100644
--- a/app.py
+++ b/app.py
@@ -14,6 +14,7 @@
 from redis import ConnectionPool
 import time
 from concurrent.futures import ThreadPoolExecutor
+import openai
 
 load_dotenv()
 
@@ -105,7 +106,11 @@ def markdown_filter(text):
 app = setup_app()
 
 # OpenAI client setup
-client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+# client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+client = openai.OpenAI(
+    api_key=os.environ.get("SAMBANOVA_API_KEY"),
+    base_url="https://api.sambanova.ai/v1",
+)
 
 
 # Initialize the reranker
@@ -114,7 +119,8 @@ def markdown_filter(text):
 # Replace groq_hyde function
 def openai_hyde(query):
     chat_completion = client.chat.completions.create(
-        model="gpt-4o-mini",
+        # model="gpt-4o-mini",
+        model='Meta-Llama-3.1-70B-Instruct',
         messages=[
             {
                 "role": "system",
@@ -130,7 +136,8 @@ def openai_hyde(query):
 
 def openai_hyde_v2(query, temp_context, hyde_query):
     chat_completion = client.chat.completions.create(
-        model="gpt-4o-mini",
+        # model="gpt-4o-mini",
+        model='Meta-Llama-3.1-70B-Instruct',
         messages=[
             {
                 "role": "system",
@@ -149,7 +156,8 @@ def openai_chat(query, context):
     start_time = time.time()
 
     chat_completion = client.chat.completions.create(
-        model="gpt-4o-mini",
+        # model="gpt-4o-mini",
+        model='Meta-Llama-3.1-70B-Instruct',
         messages=[
             {
                 "role": "system",
@@ -335,6 +343,22 @@ def home():
     app.run(host='0.0.0.0', port=5001)
 
 
+# Main latency here comes from context generation + LLM processing, so we need a faster LLM
+
+
+# SambaNova halves the total effective time
+# 13-Nov-24 03:33:14 - First HYDE call took: 2.20 seconds
+# 13-Nov-24 03:33:15 - First DB search took: 1.44 seconds
+# 13-Nov-24 03:33:20 - Second HYDE call took: 4.91 seconds
+# 13-Nov-24 03:33:22 - Second DB search took: 1.53 seconds
+# 13-Nov-24 03:33:22 - Reranking enabled: True
+# 13-Nov-24 03:33:22 - Reranking took: 0.00 seconds
+# 13-Nov-24 03:33:22 - Final DB search took: 0.55 seconds
+# 13-Nov-24 03:33:22 - Context generation complete.
+# 13-Nov-24 03:33:22 - Total context generation took: 10.63 seconds
+# 13-Nov-24 03:33:22 - Generated context for query with @codebase.
+# 13-Nov-24 03:33:28 - Chat response took: 5.59 seconds
+
 # 127.0.0.1 - - [13/Nov/2024 02:45:06] "GET / HTTP/1.1" 200 -
 # 13-Nov-24 02:45:21 - First HYDE call took: 3.05 seconds
 # 13-Nov-24 02:45:23 - First DB search took: 2.36 seconds
 # 13-Nov-24 02:45:34 - Second HYDE call took: 10.82 seconds
 # 13-Nov-24 02:45:36 - Reranking took: 2.44 seconds
 # 13-Nov-24 02:45:37 - Second DB search took: 0.65 seconds
 # 13-Nov-24 02:45:37 - Context generation complete.
 # 13-Nov-24 02:45:37 - Total context generation took: 19.32 seconds
 # 13-Nov-24 02:45:37 - Generated context for query with @codebase.
 # 13-Nov-24 02:46:00 - Chat response took: 23.01 seconds
 
 
 # 127.0.0.1 - - [13/Nov/2024 03:01:37] "POST / HTTP/1.1" 200 -
 # 13-Nov-24 03:01:54 - First HYDE call took: 3.18 seconds
 # 13-Nov-24 03:01:55 - First DB search took: 1.28 seconds
 # 13-Nov-24 03:02:02 - Second HYDE call took: 6.87 seconds
 # 13-Nov-24 03:02:03 - Second DB search took: 0.85 seconds
 # 13-Nov-24 03:02:03 - Reranking took: 0.00 seconds
 # 13-Nov-24 03:02:04 - Final DB search took: 0.68 seconds
 # 13-Nov-24 03:02:04 - Context generation complete.
 # 13-Nov-24 03:02:04 - Total context generation took: 12.86 seconds
 # 13-Nov-24 03:02:04 - Generated context for query with @codebase.
-# 13-Nov-24 03:02:26 - Chat response took: 22.19 seconds
\ No newline at end of file
+# 13-Nov-24 03:02:26 - Chat response took: 22.19 seconds
+
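Worth noting about the provider swap in patch 02: any OpenAI-compatible endpoint can be selected purely through `base_url` and an API key, so the choice does not have to be hard-coded. A sketch of making it configurable (the SambaNova URL and env-var name are the ones the patch uses; the registry and fallback behaviour are assumptions, not part of the patch):

```python
import os
import openai

# Hypothetical registry mapping a provider name to (base_url, API-key env var).
# Only the SambaNova entry comes from the patch; the rest is illustrative.
PROVIDERS = {
    "openai": (None, "OPENAI_API_KEY"),  # None -> SDK default base URL
    "sambanova": ("https://api.sambanova.ai/v1", "SAMBANOVA_API_KEY"),
}

def make_client(provider: str = "sambanova") -> openai.OpenAI:
    """Build an OpenAI-compatible client for the named provider."""
    base_url, key_env = PROVIDERS[provider]
    api_key = os.environ.get(key_env)
    if not api_key:
        raise RuntimeError(f"{key_env} is not set; cannot build '{provider}' client")
    return openai.OpenAI(api_key=api_key, base_url=base_url)

# client = make_client(os.environ.get("LLM_PROVIDER", "sambanova"))
```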
From 505e5556dd093647c832bb43d6f97b7a4d3e8dc3 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Wed, 13 Nov 2024 03:42:15 +0530
Subject: [PATCH 03/12] update readme

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index a2d223d..57ee923 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,10 @@ A powerful code search and query system that lets you explore codebases using na
 
 > **Note**: New OpenAI/Anthropic accounts may experience token rate limits. Consider using an established account.
 
+## Note:
+
+This branch uses SambaNova's API for faster LLM processing (a 2x speed-up over gpt-4o-mini timings) plus batch processing for the VDB queries
+
 ## What is CodeQA?
 
 CodeQA helps you understand codebases by:
@@ -67,7 +71,10 @@ Create a .env file and add the following:
 ```
 OPENAI_API_KEY="your-openai-api-key"
 JINA_API_KEY="your-jina-api-key"
+SAMBANOVA_API_KEY="your-sambanova-api-key"
 ```
+
+This branch uses SambaNova's API for faster LLM processing (a 2x speed-up over gpt-4o-mini timings)
 
 ## Building the Codebase Index
 To build the index for the codebase, run the following script:

From f0692c33e0293da26105cc7ca0d417ed70585334 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Wed, 13 Nov 2024 21:55:55 +0530
Subject: [PATCH 04/12] add small reranker to filter context, now runs below 20 seconds

---
 app.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/app.py b/app.py
index 7f5e7f5..c0da010 100644
--- a/app.py
+++ b/app.py
@@ -21,7 +21,8 @@
 from prompts import (
     HYDE_SYSTEM_PROMPT,
     HYDE_V2_SYSTEM_PROMPT,
-    CHAT_SYSTEM_PROMPT
+    CHAT_SYSTEM_PROMPT,
+    RERANK_PROMPT
 )
 
 # Configuration
@@ -175,6 +176,29 @@ def openai_chat(query, context):
 
     return chat_completion.choices[0].message.content
 
+def rerank_using_small_model(query, context):
+    start_time = time.time()
+
+    chat_completion = client.chat.completions.create(
+        # model="gpt-4o-mini",
+        model='Meta-Llama-3.1-8B-Instruct',
+        messages=[
+            {
+                "role": "system",
+                "content": RERANK_PROMPT.format(context=context)
+            },
+            {
+                "role": "user",
+                "content": query,
+            }
+        ]
+    )
+
+    chat_time = time.time() - start_time
+    app.logger.info(f"Llama 8B reranker response took: {chat_time:.2f} seconds")
+
+    return chat_completion.choices[0].message.content
+
 def process_input(input_text):
     processed_text = input_text.replace('\n', ' ').replace('\t', ' ')
     processed_text = re.sub(r'\s+', ' ', processed_text)
@@ -270,12 +294,16 @@ def rerank_class_search():
         for i, doc in enumerate(top_3_classes)
     )
 
+    final_context = rerank_using_small_model(query, classes_combined + "\n" + methods_combined)
+
     app.logger.info("Context generation complete.")
 
     total_time = time.time() - start_time
     app.logger.info(f"Total context generation took: {total_time:.2f} seconds")
+    return final_context
+
 
-    return methods_combined + "\n below is class or constructor related code \n" + classes_combined
+    # return methods_combined + "\n below is class or constructor related code \n" + classes_combined
 
 @app.route('/', methods=['GET', 'POST'])
 def home():
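A robustness note on the LLM-based filter patch 04 introduces: if the 8B call fails or returns almost nothing, the chat model would receive no usable context at all. A defensive wrapper costs little (illustrative sketch only; the threshold and error policy are assumptions, not something the patch specifies):

```python
import logging

logger = logging.getLogger(__name__)  # stands in for app.logger here

def filter_context_safely(query, raw_context, filter_fn, min_chars=200):
    """Run the small-model filter but fall back to the unfiltered context.

    Hypothetical hardening around rerank_using_small_model: if the filter
    call raises, or returns suspiciously little text, keep the original
    context rather than starving the chat model.
    """
    try:
        filtered = filter_fn(query, raw_context)
    except Exception as exc:  # network errors, rate limits, etc.
        logger.warning(f"Context filter failed, using raw context: {exc}")
        return raw_context
    if not filtered or len(filtered) < min_chars:
        logger.warning("Context filter returned too little text; using raw context")
        return raw_context
    return filtered

# final_context = filter_context_safely(query, combined, rerank_using_small_model)
```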
From 130b750f2e5a51b5c1d35951cc27e61eecf5b9c3 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Thu, 14 Nov 2024 02:17:27 +0530
Subject: [PATCH 05/12] fix: reduce max tokens, switch to gpt-4o-mini

---
 app.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/app.py b/app.py
index c0da010..024922e 100644
--- a/app.py
+++ b/app.py
@@ -107,7 +107,7 @@ def markdown_filter(text):
 app = setup_app()
 
 # OpenAI client setup
-# client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
 client = openai.OpenAI(
     api_key=os.environ.get("SAMBANOVA_API_KEY"),
     base_url="https://api.sambanova.ai/v1",
 )
@@ -119,9 +119,9 @@ def markdown_filter(text):
 
 # Replace groq_hyde function
 def openai_hyde(query):
-    chat_completion = client.chat.completions.create(
-        # model="gpt-4o-mini",
-        model='Meta-Llama-3.1-70B-Instruct',
+    chat_completion = openai_client.chat.completions.create(
+        model="gpt-4o-mini",
+        max_tokens=512,
         messages=[
             {
                 "role": "system",
@@ -133,12 +133,13 @@ def openai_hyde(query):
             }
         ]
     )
+    app.logger.info(f"First HYDE response: {chat_completion.choices[0].message.content}")
     return chat_completion.choices[0].message.content
 
 def openai_hyde_v2(query, temp_context, hyde_query):
-    chat_completion = client.chat.completions.create(
-        # model="gpt-4o-mini",
-        model='Meta-Llama-3.1-70B-Instruct',
+    chat_completion = openai_client.chat.completions.create(
+        model="gpt-4o-mini",
+        max_tokens=1024,
         messages=[
             {
                 "role": "system",
@@ -150,6 +151,7 @@ def openai_hyde_v2(query, temp_context, hyde_query):
             }
         ]
     )
+    app.logger.info(f"Second HYDE response: {chat_completion.choices[0].message.content}")
     return chat_completion.choices[0].message.content
 
 
@@ -157,7 +159,6 @@ def openai_chat(query, context):
     start_time = time.time()
 
     chat_completion = client.chat.completions.create(
-        # model="gpt-4o-mini",
         model='Meta-Llama-3.1-70B-Instruct',
         messages=[
             {
@@ -180,7 +181,6 @@ def rerank_using_small_model(query, context):
     start_time = time.time()
 
     chat_completion = client.chat.completions.create(
-        # model="gpt-4o-mini",
        model='Meta-Llama-3.1-8B-Instruct',
         messages=[
             {
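A design note on this commit: after it, the app deliberately runs two clients side by side, gpt-4o-mini for the short HyDE generations and SambaNova's Llama 3.1 models for the chat and filter calls. A config-table sketch makes the routing explicit (the helper and table names are hypothetical; only the model/client pairings are taken from the patch):

```python
# Hypothetical routing table; the patch wires these pairs up by hand instead.
# "openai" is the stock client, "sambanova" the base_url-overridden one.
MODEL_ROUTES = {
    "hyde":    {"client": "openai",    "model": "gpt-4o-mini", "max_tokens": 512},
    "hyde_v2": {"client": "openai",    "model": "gpt-4o-mini", "max_tokens": 1024},
    "chat":    {"client": "sambanova", "model": "Meta-Llama-3.1-70B-Instruct"},
    "filter":  {"client": "sambanova", "model": "Meta-Llama-3.1-8B-Instruct"},
}

def complete(task, system_prompt, user_content, clients):
    """Dispatch one chat completion according to the routing table."""
    route = MODEL_ROUTES[task]
    kwargs = {"model": route["model"]}
    if "max_tokens" in route:  # only pass the cap when the route sets one
        kwargs["max_tokens"] = route["max_tokens"]
    resp = clients[route["client"]].chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        **kwargs,
    )
    return resp.choices[0].message.content
```

Centralising the pairs like this keeps later model swaps (such as the ones in patches 02 and 05) to a one-line change.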
From 5fb3d0d8796b584359520d503f81d33db4b5d4a6 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Thu, 14 Nov 2024 02:53:30 +0530
Subject: [PATCH 06/12] update the rerank prompt

---
 prompts.py | 49 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/prompts.py b/prompts.py
index 976395a..149f7ab 100644
--- a/prompts.py
+++ b/prompts.py
@@ -9,13 +9,15 @@
 4. Include specific method names, class names, and key concepts in your response.
 5. If applicable, suggest modern libraries or best practices for the given task.
 6. You may guess the language based on the context provided.
+7. Does the query point to the README?
 
 Output format:
 - Provide only the improved query or predicted code snippet.
 - Do not include any explanatory text outside the code.
 - Ensure the response is directly usable for further processing or execution.'''
 
-HYDE_V2_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to enhance the original query: {query} using the provided context: {temp_context}.
+HYDE_V2_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to enhance the original query: {query} using the provided context: {temp_context} such that
+it's closer to the user's actual intention.
 
 Instructions:
 1. Analyze the query and context thoroughly.
@@ -30,19 +32,7 @@
 5. Ensure the enhanced query remains focused and concise while being more descriptive and targeted.
 6. You may guess the language based on the context provided.
 
 Output format: Provide only the enhanced query. Do not include any explanatory text or additional commentary.'''
 
-REFERENCES_SYSTEM_PROMPT = '''You are an expert software engineer. Given the {query} and {context}, your task is to enhance the query:
-1. Analyze the query and context thoroughly.
-2. Frame a concise, improved query using keywords from the context that are most relevant to answering the original query.
-3. Include specific code-related details such as method names, class names, and key programming concepts.
-4. If applicable, reference important files like README.md or configuration files.
-5. Add any crucial programming terminology or best practices that might be relevant.
-6. Ensure the enhanced query remains focused while being more descriptive and targeted.
-
-Output format:
-Enhanced query here
-
-Provide only the enhanced query within the tags. Do not include any explanatory text or additional commentary.'''
 
 CHAT_SYSTEM_PROMPT = '''You are an expert software engineer providing codebase assistance. Using the provided {context}:
 
@@ -77,3 +67,36 @@
 - Acknowledge limitations if context is insufficient
 
 If you need additional context or clarification, request it specifically.'''
+
+RERANK_PROMPT = '''You are a code context filtering expert. Your task is to analyze the following context and select the most relevant information for answering the query. Anything you
+think is relevant to the query should be included.
+
+Context to analyze:
+
+{context}
+
+
+Instructions:
+1. Analyze the query to understand the user's specific needs:
+   - If they request full code, preserve complete code blocks
+   - If they ask about specific methods/functions, focus on those implementations
+   - If they ask about architecture, prioritize class definitions and relationships
+
+2. From the provided context, select:
+   - Code segments that directly answer the query
+   - Supporting context that helps understand the implementation
+   - Related references that provide valuable context
+
+3. Filtering guidelines:
+   - Remove redundant or duplicate information
+   - Maintain code structure and readability
+   - Preserve file paths and important metadata
+   - Keep only the most relevant documentation
+
+4. Format requirements:
+   - Maintain original code formatting
+   - Keep file path references
+   - Preserve class/method relationships
+   - Return filtered context in the same structure as input
+
+Output format: Return only the filtered context, maintaining the original structure but including only the most relevant information for answering the query.'''
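One caution worth attaching to this commit: these templates are filled with str.format, and RERANK_PROMPT receives raw source code as {context}. That is safe here, because str.format only treats braces in the template itself as special, but any literal brace added to a template later (a JSON example, say) would raise a KeyError or ValueError at runtime. A string.Template variant sidesteps that class of bug (an alternative sketch, not what the repo does):

```python
from string import Template

# Same idea as RERANK_PROMPT.format(context=...), but `$`-based placeholders
# mean literal { } in the template (JSON samples, code snippets) stay inert.
RERANK_TEMPLATE = Template(
    "You are a code context filtering expert.\n"
    "Context to analyze:\n\n$context\n\n"
    'Example output shape: {"files": [...]}  # braces are harmless here'
)

system_prompt = RERANK_TEMPLATE.substitute(context="def foo():\n    return {}")
```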
From 5dc6afe7002000377a36d35a5c4295c57ed5 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Thu, 14 Nov 2024 03:03:50 +0530
Subject: [PATCH 07/12] adjust max_tokens

---
 app.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/app.py b/app.py
index 024922e..9e6e693 100644
--- a/app.py
+++ b/app.py
@@ -121,7 +121,7 @@ def markdown_filter(text):
 def openai_hyde(query):
     chat_completion = openai_client.chat.completions.create(
         model="gpt-4o-mini",
-        max_tokens=512,
+        max_tokens=400,
         messages=[
             {
                 "role": "system",
@@ -139,7 +139,7 @@ def openai_hyde(query):
 def openai_hyde_v2(query, temp_context, hyde_query):
     chat_completion = openai_client.chat.completions.create(
         model="gpt-4o-mini",
-        max_tokens=1024,
+        max_tokens=768,
         messages=[
             {
                 "role": "system",

From 59eee7a33f223dac06830838d1345ae9a7485bac Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Thu, 14 Nov 2024 03:08:28 +0530
Subject: [PATCH 08/12] faster

---
 README.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 57ee923..2f41c60 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-
 
 Blog Links:
 
 [An attempt to build cursor's @codebase feature - RAG on codebases - part 1](https://blog.lancedb.com/rag-codebase-1/)
@@ -10,6 +9,18 @@ A powerful code search and query system that lets you explore codebases using na
 
 > **Note**: New OpenAI/Anthropic accounts may experience token rate limits. Consider using an established account.
 
-## Note:
+# Optimized Branch
 
-This branch uses SambaNova's API for faster LLM processing (a 2x speed-up over gpt-4o-mini timings) plus batch processing for the VDB queries
+The code on this branch runs in the 10-20 seconds range, roughly 2.5x faster than the main branch.
+Main changes:
+
+1. Reduced the max_token counts to 400 for HYDE and 768 for HYDE-v2
+This step saves roughly 5-10 seconds.
+
+2.
+a) Using SambaNova Llama 3.1 8b as a context filterer after fetching the final context + metadata from vector db
+b) Using SambaNova Llama 3.1-70b instead of gpt-4o for chat response (400 tok/s)
+This step saves 20 seconds.
 
 ## What is CodeQA?
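To sanity-check the 400/768 budgets introduced above: max_tokens is measured in tokenizer tokens, not words, so it is worth confirming that a typical HyDE draft fits. A quick check (assumes the tiktoken package is installed; gpt-4o-family models use the o200k_base encoding per OpenAI's published mapping):

```python
import tiktoken

# gpt-4o / gpt-4o-mini use the o200k_base encoding.
enc = tiktoken.get_encoding("o200k_base")

hyde_answer = "def generate_context(query): ..."  # a typical hallucinated snippet
n_tokens = len(enc.encode(hyde_answer))
print(f"{n_tokens} tokens")

# Rough rule of thumb behind the README numbers: 1 token ~ 0.75 English words,
# so max_tokens=400 caps the HYDE draft at roughly 300 words.
```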
From 3d424d16b8498bb5639740c622dfb95045bc9f1e Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Thu, 14 Nov 2024 03:10:52 +0530
Subject: [PATCH 09/12] update readme

---
 README.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2f41c60..43f5aa1 100644
--- a/README.md
+++ b/README.md
@@ -13,13 +13,19 @@ A powerful code search and query system that lets you explore codebases using na
 The code on this branch runs in the 10-20 seconds range, roughly 2.5x faster than the main branch.
 Main changes:
 
-1. Reduced the max_token counts to 400 for HYDE and 768 for HYDE-v2
+1. Reduced the max_token counts to 400 for HYDE and 768 for HYDE-v2. This ensures that the hallucinated output is not too long, thus saving time. We just
+need the minimal amount of relevant output that is close to our intended query, and less than 512 tokens is more than enough (i.e. ~300-400 words)
+
 This step saves roughly 5-10 seconds.
 
-2.
+2. Context filtering + Faster inference
+
 a) Using SambaNova Llama 3.1 8b as a context filterer after fetching the final context + metadata from vector db
 b) Using SambaNova Llama 3.1-70b instead of gpt-4o for chat response (400 tok/s)
+
+The above changes again try to reduce the amount of irrelevant context and process the relevant context as fast as possible.
+
 This step saves 20 seconds.

From f873c11141e8954a0714c2872957f06e545c89fd Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Fri, 15 Nov 2024 04:09:06 +0530
Subject: [PATCH 10/12] slightly faster now

---
 app.py     |  8 +++++---
 prompts.py | 18 ++++++++++--------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/app.py b/app.py
index 9e6e693..79f041b 100644
--- a/app.py
+++ b/app.py
@@ -143,11 +143,11 @@ def openai_hyde_v2(query, temp_context, hyde_query):
         messages=[
             {
                 "role": "system",
-                "content": HYDE_V2_SYSTEM_PROMPT.format(query=query, temp_context=temp_context)
+                "content": HYDE_V2_SYSTEM_PROMPT.format(temp_context=temp_context)
             },
             {
                 "role": "user",
-                "content": f"Predict the answer to the query: {hyde_query}",
+                "content": f"Predict the answer to the query: {query}",
             }
         ]
     )
@@ -296,6 +296,8 @@ def rerank_class_search():
 
     final_context = rerank_using_small_model(query, classes_combined + "\n" + methods_combined)
 
+    app.logger.info(f"Final context: {final_context}")
+
     app.logger.info("Context generation complete.")
 
     total_time = time.time() - start_time
@@ -334,7 +336,7 @@ def home():
         context = context.decode()
 
     # Now, apply reranking during the chat response if needed
-    response = openai_chat(query, context[:12000])  # Adjust as needed
+    response = openai_chat(query, context[:8192])  # Adjust as needed
 
     # Store the conversation history
     redis_key = f"user:{user_id}:responses"
diff --git a/prompts.py b/prompts.py
index 149f7ab..6d5e5d0 100644
--- a/prompts.py
+++ b/prompts.py
@@ -1,6 +1,6 @@
 # System prompts for different LLM interactions
 
-HYDE_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to predict code that answers the given query.
+HYDE_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to predict code that answers the user's query.
 
 Instructions:
 1. Analyze the query carefully.
@@ -8,16 +8,17 @@
 3. Generate concise, idiomatic code that addresses the query.
 4. Include specific method names, class names, and key concepts in your response.
 5. If applicable, suggest modern libraries or best practices for the given task.
-6. You may guess the language based on the context provided.
-7. Does the query point to the README?
+6. Does the query point to the README?
+7. You may guess the language based on the context provided.
 
 Output format:
+- Use plain text only for the response. Delimiters only for code.
 - Provide only the improved query or predicted code snippet.
-- Do not include any explanatory text outside the code.
-- Ensure the response is directly usable for further processing or execution.'''
+- No additional commentary or explanation other than the code or text.
+'''
 
-HYDE_V2_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to enhance the original query: {query} using the provided context: {temp_context} such that
-it's closer to the user's actual intention.
+HYDE_V2_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to answer the user's query using the provided {temp_context}. If the
+query is not good enough, your job is to enhance it using the context so that it's closer to the user's actual intention.
 
 Instructions:
 1. Analyze the query and context thoroughly.
@@ -30,7 +31,7 @@
 5. Ensure the enhanced query remains focused and concise while being more descriptive and targeted.
 6. You may guess the language based on the context provided.
 
-Output format: Provide only the enhanced query. Do not include any explanatory text or additional commentary.'''
+Output format: Provide only the enhanced query in plain text. Do not include any explanatory text or additional commentary.'''
@@ -45,6 +46,7 @@
 RESPONSE GUIDELINES:
 
 Most importantly - If you are not sure about the answer, say so. Ask user politely for more context and tell them to use "@codebase" to provide more context.
+If you think the provided context is not enough to answer the query, you can ask the user to provide more context.
 
 1. Code References:
    - Use `inline code` for methods, variables, and short snippets
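About the context[:8192] slice this commit settles on: slicing by character count can cut the final retrieved snippet mid-line (and mid-token). A small variant trims at a line boundary instead (illustrative only; the repo keeps the plain slice):

```python
def truncate_on_line(text: str, max_chars: int = 8192) -> str:
    """Truncate to at most max_chars, but never mid-line.

    Illustrative alternative to `context[:8192]`: cutting at a newline keeps
    the last retrieved code block syntactically intact for the chat model.
    """
    if len(text) <= max_chars:
        return text
    cut = text.rfind("\n", 0, max_chars)
    # Fall back to a hard cut if there is no newline in range
    return text[:cut] if cut != -1 else text[:max_chars]

# response = openai_chat(query, truncate_on_line(context))
```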
+HYDE_V2_SYSTEM_PROMPT = '''You are an expert software engineer. Your task is to answer the user's query using the provided {temp_context} . If the +query is not good enough, your job is to enhance it using the context so that it's closer to the user's actual intention. Instructions: 1. Analyze the query and context thoroughly. @@ -30,7 +31,7 @@ 5. Ensure the enhanced query remains focused and concise while being more descriptive and targeted. 6. You may guess the language based on the context provided. -Output format: Provide only the enhanced query. Do not include any explanatory text or additional commentary.''' +Output format: Provide only the enhanced query in plain text. Do not include any explanatory text or additional commentary.''' @@ -45,6 +46,7 @@ RESPONSE GUIDELINES: Most importantly - If you are not sure about the answer, say so. Ask user politely for more context and tell them to use "@codebase" to provide more context. +If you think the provided context is not enough to answer the query, you can ask the user to provide more context. 1. Code References: - Use `inline code` for methods, variables, and short snippets From 19734b49e6b72fe51c34a8b43882fc39c6347e0b Mon Sep 17 00:00:00 2001 From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com> Date: Sun, 17 Nov 2024 22:24:31 +0530 Subject: [PATCH 11/12] parallel call --- app.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/app.py b/app.py index 79f041b..e393157 100644 --- a/app.py +++ b/app.py @@ -282,19 +282,32 @@ def rerank_class_search(): final_search_time = time.time() app.logger.info(f"Final DB search took: {final_search_time - rerank_time:.2f} seconds") - # Combine documents - top_3_methods = method_docs[:3] - methods_combined = "\n\n".join( - f"File: {doc['file_path']}\nCode:\n{doc['code']}" for doc in top_3_methods - ) - - top_3_classes = class_docs[:3] - classes_combined = "\n\n".join( - f"File: {doc['file_path']}\nClass Info:\n{doc['source_code']} References: \n{doc['references']} \n END OF ROW {i}" - for i, doc in enumerate(top_3_classes) - ) - - final_context = rerank_using_small_model(query, classes_combined + "\n" + methods_combined) + def process_methods(): + top_3_methods = method_docs[:3] + methods_combined = "\n\n".join( + f"File: {doc['file_path']}\nCode:\n{doc['code']}" for doc in top_3_methods + ) + return rerank_using_small_model(query, methods_combined) + + def process_classes(): + top_3_classes = class_docs[:3] + classes_combined = "\n\n".join( + f"File: {doc['file_path']}\nClass Info:\n{doc['source_code']} References: \n{doc['references']} \n END OF ROW {i}" + for i, doc in enumerate(top_3_classes) + ) + return rerank_using_small_model(query, classes_combined) + + # Parallel execution of reranking + parallel_start_time = time.time() + with ThreadPoolExecutor(max_workers=2) as executor: + future_methods = executor.submit(process_methods) + future_classes = executor.submit(process_classes) + methods_context = future_methods.result() + classes_context = future_classes.result() + parallel_time = time.time() - parallel_start_time + app.logger.info(f"Parallel reranking took: {parallel_time:.2f} seconds") + + final_context = f"{methods_context}\n{classes_context}" app.logger.info(f"Final context: {final_context}") From 273536911927ebf6fba2e7d6f4e51ac2b6ce98a3 Mon Sep 17 00:00:00 2001 From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com> Date: Sun, 17 Nov 2024 23:42:01 +0530 Subject: [PATCH 12/12] writeup --- README.md | 17 
From 273536911927ebf6fba2e7d6f4e51ac2b6ce98a3 Mon Sep 17 00:00:00 2001
From: sankalp1999 <45618047+sankalp1999@users.noreply.github.com>
Date: Sun, 17 Nov 2024 23:42:01 +0530
Subject: [PATCH 12/12] writeup

---
 README.md | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 43f5aa1..81e508b 100644
--- a/README.md
+++ b/README.md
@@ -11,22 +11,9 @@ A powerful code search and query system that lets you explore codebases using na
 
 # Optimized Branch
 
-The code on this branch runs in the 10-20 seconds range, roughly 2.5x faster than the main branch.
-Main changes:
+Please read this quick write-up about the optimizations [here](https://sankalp.bearblog.dev/lessons-from-speeding-up-codeqa/)
+This branch runs 2.5x faster than the main branch in the worst case.
 
-1. Reduced the max_token counts to 400 for HYDE and 768 for HYDE-v2. This ensures that the hallucinated output is not too long, thus saving time. We just
-need the minimal amount of relevant output that is close to our intended query, and less than 512 tokens is more than enough (i.e. ~300-400 words)
-
-This step saves roughly 5-10 seconds.
-
-2. Context filtering + Faster inference
-
-a) Using SambaNova Llama 3.1 8b as a context filterer after fetching the final context + metadata from vector db
-b) Using SambaNova Llama 3.1-70b instead of gpt-4o for chat response (400 tok/s)
-
-The above changes again try to reduce the amount of irrelevant context and process the relevant context as fast as possible.
-
-This step saves 20 seconds.
 
 ## What is CodeQA?
 
 CodeQA helps you understand codebases by: