@@ -30,17 +30,14 @@ def embed_knowledge_base(self):
30
30
def normalize_query(self, query):
    """Canonicalize a raw user query for embedding lookup.

    Lowercases the text and trims surrounding whitespace so that
    queries differing only in case or padding map to the same key.

    Args:
        query: Raw query string from the caller.

    Returns:
        The normalized (lowercased, stripped) query string.
    """
    cleaned = query.strip()
    return cleaned.lower()
33
def get_query_embedding(self, query):
    """Embed a user query with the sentence-transformer model.

    The query is normalized first (see ``normalize_query``) and the
    resulting embedding is moved to the CPU so downstream similarity
    math (e.g. scikit-learn cosine_similarity) can consume it.

    Args:
        query: Raw query string.

    Returns:
        A CPU tensor of shape (1, dim) — the embedding of the
        normalized query.
    """
    # Normalize, encode as a single-item batch, then pull to CPU.
    return self.model.encode(
        [self.normalize_query(query)], convert_to_tensor=True
    ).cpu()
def get_doc_embeddings(self):
    """Return the precomputed knowledge-base embeddings on the CPU.

    Moves ``self.doc_embeddings`` (built by ``embed_knowledge_base``)
    off any accelerator device so they can be fed to CPU-side
    similarity routines.

    Returns:
        The document-embedding tensor, on the CPU.
    """
    embeddings = self.doc_embeddings
    return embeddings.cpu()
def compute_document_scores (self , query_embedding , doc_embeddings , high_match_threshold ):
46
43
text_similarities = cosine_similarity (query_embedding , doc_embeddings )[0 ]
@@ -66,12 +63,9 @@ def compute_document_scores(self, query_embedding, doc_embeddings, high_match_th
66
63
67
64
return result
68
65
69
- def retrieve (self , query , similarity_threshold = 0.4 , high_match_threshold = 0.8 , max_docs = 5 , use_cpu = True ):
70
- # Note: Set use_cpu=True to run on CPU, which is useful for testing or environments without a GPU.
71
- # Set use_cpu=False to leverage GPU for better performance in production.
72
-
73
- query_embedding = self .get_query_embedding (query , use_cpu )
74
- doc_embeddings = self .get_doc_embeddings (use_cpu )
66
+ def retrieve (self , query , similarity_threshold = 0.4 , high_match_threshold = 0.8 , max_docs = 5 ):
67
+ query_embedding = self .get_query_embedding (query )
68
+ doc_embeddings = self .get_doc_embeddings ()
75
69
76
70
doc_scores = self .compute_document_scores (query_embedding , doc_embeddings , high_match_threshold )
77
71
retrieved_docs = self .get_top_docs (doc_scores , similarity_threshold , max_docs )
@@ -149,11 +143,11 @@ def answer_query_stream(self, query):
149
143
150
144
collected_messages = []
151
145
for chunk in stream :
152
- if chunk ['choices' ][0 ]['finish_reason' ] is not None :
153
- break
154
146
content = chunk ['choices' ][0 ]['delta' ].get ('content' , '' )
155
147
collected_messages .append (content )
156
148
yield content
149
+ if chunk ['choices' ][0 ].get ('finish_reason' ) is not None :
150
+ break
157
151
158
152
if len (citations ) > 0 :
159
153
yield "\n \n References:\n " + "\n " .join (citations )
@@ -193,3 +187,6 @@ def get_context(self, retrieved_docs):
193
187
for doc in retrieved_docs :
194
188
retrieved_text .append (f"{ doc ['about' ]} . { doc ['text' ]} " )
195
189
return "\n \n " .join (retrieved_text )
190
+
191
+ # # Instantiate the RAGSystem
192
+ # rag_system = RAGSystem()
0 commit comments