Skip to content

Commit 285f787

Browse files
authored
Merge pull request #48 from DefangLabs/data-dir
Create endpoint for exposing samples and knowledge base data
2 parents 37f1465 + 9ffd854 commit 285f787

File tree

8 files changed

+1054
-139
lines changed

8 files changed

+1054
-139
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ This application demonstrates how to deploy a Flask-based Retrieval-Augmented Ge
4141
## Configuration
4242

4343
- The knowledge base is all the markdown files in the Defang docs [website](https://docs.defang.io/docs/intro). The logic for parsing can be found in `./app/get_knowledge_base.py`.
44-
- The file `get_knowledge_base.py` parses every webpage as specified into paragraphs and writes to `knowledge_base.json` for the RAG retrieval.
44+
- The file `get_knowledge_base.py` parses every webpage as specified into paragraphs and writes to `./data/knowledge_base.json` for the RAG retrieval.
4545
- To obtain your own knowledge base, please feel free to implement your own parsing scheme.
4646
- For local development, please use the `compose.dev.yaml` file, whereas for production, please use the `compose.yaml`.
4747

app/app.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from flask import Flask, request, jsonify, render_template, Response, stream_with_context, session
1+
from flask import Flask, request, jsonify, render_template, Response, stream_with_context, session, send_from_directory
22
from flask_wtf.csrf import CSRFProtect
33
from rag_system import rag_system
44
import hashlib
@@ -18,7 +18,6 @@
1818

1919
csrf = CSRFProtect(app)
2020

21-
2221
def validate_pow(nonce, data, difficulty):
2322
# Calculate the sha256 of the concatenated string of 32-bit X-Nonce header and raw body.
2423
# This calculation has to match the code on the client side, in index.html.
@@ -102,6 +101,15 @@ def trigger_rebuild():
102101

103102
print("Finished running get_knowledge_base.py script.")
104103

104+
# get Dockerfiles and compose files from samples repo
105+
print("Running get_samples_examples.py script...")
106+
result = subprocess.run(["python3", "get_samples_examples.py"], capture_output=True, text=True)
107+
if result.returncode != 0:
108+
print(f"Error running get_samples_examples.py script: {result.stderr}")
109+
return jsonify({"error": "Error running get_samples_examples.py script", "details": result.stderr}), 500
110+
111+
print("Finished running get_samples_examples.py script.")
112+
105113
print("Rebuilding embeddings...")
106114
try:
107115
rag_system.rebuild_embeddings()
@@ -116,6 +124,13 @@ def trigger_rebuild():
116124
print(f"Error in /trigger-rebuild endpoint: {e}")
117125
return jsonify({"error": "Internal Server Error"}), 500
118126

127+
@app.route("/data/<path:name>")
@csrf.exempt
def download_file(name):
    """Serve a file from the app's local ``data`` directory as a download.

    ``send_from_directory`` resolves *name* inside "data" and rejects paths
    that escape it, so the ``<path:...>`` converter is safe to expose here.
    """
    return send_from_directory("data", name, as_attachment=True)
133+
119134
if os.getenv('DEBUG') == '1':
120135
@app.route('/ask/debug', methods=['POST'])
121136
def debug_context():

app/knowledge_base.json renamed to app/data/knowledge_base.json

Lines changed: 308 additions & 128 deletions
Large diffs are not rendered by default.

app/data/samples_examples.json

Lines changed: 569 additions & 0 deletions
Large diffs are not rendered by default.

app/get_knowledge_base.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import json
66
from git import Repo
77

8+
kb_file_path = './data/knowledge_base.json'
9+
810
def clean_tmp(dir_path):
911
""" Clears out all contents of the specified directory except for prebuild.sh """
1012
for item in os.listdir(dir_path):
@@ -80,14 +82,14 @@ def parse_markdown():
8082

8183
def reset_knowledge_base():
8284
""" Resets or initializes the knowledge base JSON file. """
83-
with open('./knowledge_base.json', 'w') as output_file:
85+
with open(kb_file_path, 'w') as output_file:
8486
json.dump([], output_file)
8587

8688
def parse_markdown_file_to_json(file_path):
8789
""" Parses individual markdown file and adds its content to JSON """
8890
try:
8991
# Load existing content if the file exists
90-
with open('./knowledge_base.json', 'r') as existing_file:
92+
with open(kb_file_path, 'r') as existing_file:
9193
json_output = json.load(existing_file)
9294
current_id = len(json_output) + 1 # Start ID from the next available number
9395
except (FileNotFoundError, json.JSONDecodeError):
@@ -147,15 +149,15 @@ def parse_markdown_file_to_json(file_path):
147149
})
148150
current_id += 1
149151

150-
# Write the augmented JSON output to knowledge_base.json
151-
with open('./knowledge_base.json', 'w', encoding='utf-8') as output_file:
152+
# Write the augmented JSON output to ./data/knowledge_base.json
153+
with open(kb_file_path, 'w', encoding='utf-8') as output_file:
152154
json.dump(json_output, output_file, indent=2, ensure_ascii=False)
153155

154156
def parse_cli_markdown(file_path):
155157
""" Parses CLI-specific markdown files """
156158
try:
157159
# Load existing content if the file exists
158-
with open('./knowledge_base.json', 'r') as existing_file:
160+
with open(kb_file_path, 'r') as existing_file:
159161
json_output = json.load(existing_file)
160162
current_id = len(json_output) + 1 # Start ID from the next available number
161163
except (FileNotFoundError, json.JSONDecodeError):
@@ -187,8 +189,8 @@ def parse_cli_markdown(file_path):
187189
})
188190
current_id += 1
189191

190-
# Write the augmented JSON output to knowledge_base.json
191-
with open('./knowledge_base.json', 'w', encoding='utf-8') as output_file:
192+
# Write the augmented JSON output to data/knowledge_base.json
193+
with open(kb_file_path, 'w', encoding='utf-8') as output_file:
192194
json.dump(json_output, output_file, indent=2, ensure_ascii=False)
193195

194196
def recursive_parse_directory(root_dir):

app/get_samples_examples.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#!/usr/bin/env python
2+
import os
3+
import json
4+
import shutil
5+
import tempfile
6+
import subprocess
7+
import yaml
8+
import re
9+
10+
def clone_repo(repo_url, target_dir):
    """Clone the git repository at *repo_url* into *target_dir*.

    Raises:
        subprocess.CalledProcessError: if the ``git clone`` command fails.
    """
    print(f"Cloning {repo_url} to {target_dir}...")
    clone_cmd = ["git", "clone", repo_url, target_dir]
    # check=True surfaces a non-zero git exit status as an exception.
    subprocess.run(clone_cmd, check=True)
    print("Repository cloned successfully.")
16+
def get_technologies(dockerfile_content, compose_content):
    """Extract technologies used in the project from its Dockerfile and compose file.

    Args:
        dockerfile_content: Raw text of the sample's Dockerfile ("" if none).
        compose_content: Raw text of the sample's compose file ("" if none).

    Returns:
        Sorted list of unique technology names detected with simple
        substring heuristics (base images, install commands, compose services).
    """
    technologies = set()
    dockerfile_lower = dockerfile_content.lower()

    # Common base images: case-sensitive match on the raw Dockerfile text,
    # mirroring the conventional upper-case "FROM <image>" instruction.
    # (The original also tested "FROM golang:" separately, but that is a
    # substring of "FROM golang" and therefore always redundant.)
    base_images = {
        "FROM python": "Python",
        "FROM node": "Node.js",
        "FROM golang": "Go",
        "FROM php": "PHP",
        "FROM ruby": "Ruby",
        "FROM rust": "Rust",
    }
    for marker, tech in base_images.items():
        if marker in dockerfile_content:
            technologies.add(tech)

    # Frameworks/libraries: require the install command (case-sensitive)
    # plus a case-insensitive mention of the package name.
    frameworks = [
        ("pip install", "flask", "Flask"),
        ("pip install", "django", "Django"),
        ("npm install", "react", "React"),
        ("npm install", "express", "Express.js"),
        ("npm install", "next", "Next.js"),
    ]
    for install_cmd, package, tech in frameworks:
        if install_cmd in dockerfile_content and package in dockerfile_lower:
            technologies.add(tech)

    # Databases/services mentioned anywhere in the compose file.
    if compose_content:
        compose_lower = compose_content.lower()
        services = {
            "postgres": "PostgreSQL",
            "mysql": "MySQL",
            "redis": "Redis",
            "mongodb": "MongoDB",
        }
        for marker, tech in services.items():
            if marker in compose_lower:
                technologies.add(tech)

    # Sorted for deterministic output: the original list(set(...)) order
    # varied between runs, producing noisy diffs in the generated JSON.
    return sorted(technologies)
58+
59+
def generate_description(project_name, technologies):
    """Generate a simple one-sentence description of a sample project.

    Args:
        project_name: Name of the sample project directory.
        technologies: List of technology names; may be empty.

    Returns:
        A description string for the sample's JSON entry.
    """
    if not technologies:
        # Joining an empty list would yield the malformed
        # "A  application ..." (double space); use a generic sentence instead.
        return f"An application that demonstrates how to deploy a {project_name} project with Defang."
    tech_str = ", ".join(technologies)
    return f"A {tech_str} application that demonstrates how to deploy a {project_name} project with Defang."
63+
64+
def process_sample_directory(sample_dir):
    """Process a sample directory and extract relevant information.

    Args:
        sample_dir: Path to one sample project directory.

    Returns:
        Dict with ``projectName``, ``compose``, ``dockerfile``,
        ``technologies`` and ``description`` keys, or None when the
        directory has neither a compose file nor a Dockerfile.
    """
    project_name = os.path.basename(sample_dir)
    print(f"Processing sample: {project_name}")

    # Find the first compose-file variant present at the sample's top level.
    compose_file = None
    for filename in ("compose.yaml", "compose.yml", "docker-compose.yaml", "docker-compose.yml"):
        candidate = os.path.join(sample_dir, filename)
        if os.path.exists(candidate):
            compose_file = candidate
            break

    # Find the first Dockerfile anywhere under the sample directory.
    dockerfile = None
    for root, _, files in os.walk(sample_dir):
        if "Dockerfile" in files:
            dockerfile = os.path.join(root, "Dockerfile")
            break

    if not compose_file and not dockerfile:
        print(f"Skipping {project_name}: No compose file or Dockerfile found")
        return None

    result = {"projectName": project_name, "compose": "", "dockerfile": ""}

    # Read as UTF-8 explicitly so parsing does not depend on the host
    # locale (sample repos routinely contain non-ASCII text).
    if compose_file:
        with open(compose_file, 'r', encoding='utf-8') as f:
            result["compose"] = f.read()

    if dockerfile:
        with open(dockerfile, 'r', encoding='utf-8') as f:
            result["dockerfile"] = f.read()

    # Derive the technology list and a human-readable description.
    technologies = get_technologies(result["dockerfile"], result["compose"])
    result["technologies"] = technologies
    result["description"] = generate_description(project_name, technologies)

    return result
115+
116+
def main():
    """Clone the DefangLabs samples repo and export each sample's Dockerfile,
    compose file, detected technologies, and description to a JSON data file.
    """
    repo_url = "https://github.com/DefangLabs/samples"
    output_file = "./data/samples_examples.json"

    # Ensure the output directory exists up front, so a fresh checkout or
    # container does not fail only after the slow network clone step.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        # Clone the repository into a throwaway directory.
        clone_repo(repo_url, temp_dir)

        # Process only the /samples directory of the repo.
        samples_dir = os.path.join(temp_dir, "samples")
        if not os.path.exists(samples_dir):
            print(f"Error: samples directory not found in {temp_dir}")
            return

        # Each immediate subdirectory is one sample project.
        sample_dirs = [os.path.join(samples_dir, d) for d in os.listdir(samples_dir)
                       if os.path.isdir(os.path.join(samples_dir, d))]

        # Extract the metadata for every sample that has a compose file
        # or Dockerfile; process_sample_directory returns None otherwise.
        results = []
        for sample_dir in sample_dirs:
            sample_data = process_sample_directory(sample_dir)
            if sample_data:
                results.append(sample_data)

        # Persist the collected samples for the /data endpoint to serve.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        print(f"Successfully processed {len(results)} samples. Results saved to {output_file}")

if __name__ == "__main__":
    main()

app/rag_system.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
openai.api_key = os.getenv("OPENAI_API_KEY")
1313

1414
class RAGSystem:
15-
def __init__(self, knowledge_base_path='knowledge_base.json'):
15+
def __init__(self, knowledge_base_path='./data/knowledge_base.json'):
1616
self.knowledge_base_path = knowledge_base_path
1717
self.knowledge_base = self.load_knowledge_base()
1818
self.model = SentenceTransformer('all-MiniLM-L6-v2')

app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ sentence-transformers==2.2.1
88
torch==1.10.0
99
huggingface_hub==0.8.1
1010
openai==0.28.0
11+
PyYAML==6.0.2

0 commit comments

Comments
 (0)