Skip to content

Commit 285f787

Browse files
authored
Merge pull request #48 from DefangLabs/data-dir
Create endpoint for exposing samples and knowledge base data
2 parents 37f1465 + 9ffd854 commit 285f787

File tree

8 files changed

+1054
-139
lines changed

8 files changed

+1054
-139
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ This application demonstrates how to deploy a Flask-based Retrieval-Augmented Ge
4141
## Configuration
4242

4343
- The knowledge base is all the markdown files in the Defang docs [website](https://docs.defang.io/docs/intro). The logic for parsing can be found in `./app/get_knowledge_base.py`.
44-
- The file `get_knowledge_base.py` parses every webpage as specified into paragraphs and writes to `knowledge_base.json` for the RAG retrieval.
44+
- The file `get_knowledge_base.py` parses every webpage as specified into paragraphs and writes to `./data/knowledge_base.json` for the RAG retrieval.
4545
- To obtain your own knowledge base, please feel free to implement your own parsing scheme.
4646
- For local development, please use the `compose.dev.yaml` file, whereas for production, please use the `compose.yaml`.
4747

app/app.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from flask import Flask, request, jsonify, render_template, Response, stream_with_context, session
1+
from flask import Flask, request, jsonify, render_template, Response, stream_with_context, session, send_from_directory
22
from flask_wtf.csrf import CSRFProtect
33
from rag_system import rag_system
44
import hashlib
@@ -18,7 +18,6 @@
1818

1919
csrf = CSRFProtect(app)
2020

21-
2221
def validate_pow(nonce, data, difficulty):
2322
# Calculate the sha256 of the concatenated string of 32-bit X-Nonce header and raw body.
2423
# This calculation has to match the code on the client side, in index.html.
@@ -102,6 +101,15 @@ def trigger_rebuild():
102101

103102
print("Finished running get_knowledge_base.py script.")
104103

104+
# get Dockerfiles and compose files from samples repo
105+
print("Running get_samples_examples.py script...")
106+
result = subprocess.run(["python3", "get_samples_examples.py"], capture_output=True, text=True)
107+
if result.returncode != 0:
108+
print(f"Error running get_samples_examples.py script: {result.stderr}")
109+
return jsonify({"error": "Error running get_samples_examples.py script", "details": result.stderr}), 500
110+
111+
print("Finished running get_samples_examples.py script.")
112+
105113
print("Rebuilding embeddings...")
106114
try:
107115
rag_system.rebuild_embeddings()
@@ -116,6 +124,13 @@ def trigger_rebuild():
116124
print(f"Error in /trigger-rebuild endpoint: {e}")
117125
return jsonify({"error": "Internal Server Error"}), 500
118126

127+
@app.route("/data/<path:name>")
@csrf.exempt
def download_file(name):
    """Serve a file from the app's local ``data`` directory as a download.

    ``send_from_directory`` resolves *name* inside "data" and rejects paths
    that escape it, so the ``<path:...>`` converter is safe to expose here.
    """
    return send_from_directory("data", name, as_attachment=True)
133+
119134
if os.getenv('DEBUG') == '1':
120135
@app.route('/ask/debug', methods=['POST'])
121136
def debug_context():

app/knowledge_base.json renamed to app/data/knowledge_base.json

Lines changed: 308 additions & 128 deletions
Large diffs are not rendered by default.

app/data/samples_examples.json

Lines changed: 569 additions & 0 deletions
Large diffs are not rendered by default.

app/get_knowledge_base.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import json
66
from git import Repo
77

8+
kb_file_path = './data/knowledge_base.json'
9+
810
def clean_tmp(dir_path):
911
""" Clears out all contents of the specified directory except for prebuild.sh """
1012
for item in os.listdir(dir_path):
@@ -80,14 +82,14 @@ def parse_markdown():
8082

8183
def reset_knowledge_base():
8284
""" Resets or initializes the knowledge base JSON file. """
83-
with open('./knowledge_base.json', 'w') as output_file:
85+
with open(kb_file_path, 'w') as output_file:
8486
json.dump([], output_file)
8587

8688
def parse_markdown_file_to_json(file_path):
8789
""" Parses individual markdown file and adds its content to JSON """
8890
try:
8991
# Load existing content if the file exists
90-
with open('./knowledge_base.json', 'r') as existing_file:
92+
with open(kb_file_path, 'r') as existing_file:
9193
json_output = json.load(existing_file)
9294
current_id = len(json_output) + 1 # Start ID from the next available number
9395
except (FileNotFoundError, json.JSONDecodeError):
@@ -147,15 +149,15 @@ def parse_markdown_file_to_json(file_path):
147149
})
148150
current_id += 1
149151

150-
# Write the augmented JSON output to knowledge_base.json
151-
with open('./knowledge_base.json', 'w', encoding='utf-8') as output_file:
152+
# Write the augmented JSON output to ./data/knowledge_base.json
153+
with open(kb_file_path, 'w', encoding='utf-8') as output_file:
152154
json.dump(json_output, output_file, indent=2, ensure_ascii=False)
153155

154156
def parse_cli_markdown(file_path):
155157
""" Parses CLI-specific markdown files """
156158
try:
157159
# Load existing content if the file exists
158-
with open('./knowledge_base.json', 'r') as existing_file:
160+
with open(kb_file_path, 'r') as existing_file:
159161
json_output = json.load(existing_file)
160162
current_id = len(json_output) + 1 # Start ID from the next available number
161163
except (FileNotFoundError, json.JSONDecodeError):
@@ -187,8 +189,8 @@ def parse_cli_markdown(file_path):
187189
})
188190
current_id += 1
189191

190-
# Write the augmented JSON output to knowledge_base.json
191-
with open('./knowledge_base.json', 'w', encoding='utf-8') as output_file:
192+
# Write the augmented JSON output to data/knowledge_base.json
193+
with open(kb_file_path, 'w', encoding='utf-8') as output_file:
192194
json.dump(json_output, output_file, indent=2, ensure_ascii=False)
193195

194196
def recursive_parse_directory(root_dir):

app/get_samples_examples.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#!/usr/bin/env python
2+
import os
3+
import json
4+
import shutil
5+
import tempfile
6+
import subprocess
7+
import yaml
8+
import re
9+
10+
def clone_repo(repo_url, target_dir):
    """Clone the git repository at *repo_url* into *target_dir*.

    Raises:
        subprocess.CalledProcessError: if the ``git clone`` command fails.
    """
    print(f"Cloning {repo_url} to {target_dir}...")
    clone_cmd = ["git", "clone", repo_url, target_dir]
    # check=True surfaces a non-zero git exit status as an exception.
    subprocess.run(clone_cmd, check=True)
    print("Repository cloned successfully.")
16+
def get_technologies(dockerfile_content, compose_content):
    """Extract technologies used in the project from its Dockerfile and compose file.

    Args:
        dockerfile_content: Raw text of the sample's Dockerfile ("" if none).
        compose_content: Raw text of the sample's compose file ("" if none).

    Returns:
        Sorted list of unique technology names detected with simple
        substring heuristics (base images, install commands, compose services).
    """
    technologies = set()
    dockerfile_lower = dockerfile_content.lower()

    # Common base images: case-sensitive match on the raw Dockerfile text,
    # mirroring the conventional upper-case "FROM <image>" instruction.
    # (The original also tested "FROM golang:" separately, but that is a
    # substring of "FROM golang" and therefore always redundant.)
    base_images = {
        "FROM python": "Python",
        "FROM node": "Node.js",
        "FROM golang": "Go",
        "FROM php": "PHP",
        "FROM ruby": "Ruby",
        "FROM rust": "Rust",
    }
    for marker, tech in base_images.items():
        if marker in dockerfile_content:
            technologies.add(tech)

    # Frameworks/libraries: require the install command (case-sensitive)
    # plus a case-insensitive mention of the package name.
    frameworks = [
        ("pip install", "flask", "Flask"),
        ("pip install", "django", "Django"),
        ("npm install", "react", "React"),
        ("npm install", "express", "Express.js"),
        ("npm install", "next", "Next.js"),
    ]
    for install_cmd, package, tech in frameworks:
        if install_cmd in dockerfile_content and package in dockerfile_lower:
            technologies.add(tech)

    # Databases/services mentioned anywhere in the compose file.
    if compose_content:
        compose_lower = compose_content.lower()
        services = {
            "postgres": "PostgreSQL",
            "mysql": "MySQL",
            "redis": "Redis",
            "mongodb": "MongoDB",
        }
        for marker, tech in services.items():
            if marker in compose_lower:
                technologies.add(tech)

    # Sorted for deterministic output: the original list(set(...)) order
    # varied between runs, producing noisy diffs in the generated JSON.
    return sorted(technologies)
58+
59+
def generate_description(project_name, technologies):
    """Generate a simple one-sentence description of a sample project.

    Args:
        project_name: Name of the sample project directory.
        technologies: List of technology names; may be empty.

    Returns:
        A description string for the sample's JSON entry.
    """
    if not technologies:
        # Joining an empty list would yield the malformed
        # "A  application ..." (double space); use a generic sentence instead.
        return f"An application that demonstrates how to deploy a {project_name} project with Defang."
    tech_str = ", ".join(technologies)
    return f"A {tech_str} application that demonstrates how to deploy a {project_name} project with Defang."
63+
64+
def process_sample_directory(sample_dir):
    """Process a sample directory and extract relevant information.

    Args:
        sample_dir: Path to one sample project directory.

    Returns:
        Dict with ``projectName``, ``compose``, ``dockerfile``,
        ``technologies`` and ``description`` keys, or None when the
        directory has neither a compose file nor a Dockerfile.
    """
    project_name = os.path.basename(sample_dir)
    print(f"Processing sample: {project_name}")

    # Find the first compose-file variant present at the sample's top level.
    compose_file = None
    for filename in ("compose.yaml", "compose.yml", "docker-compose.yaml", "docker-compose.yml"):
        candidate = os.path.join(sample_dir, filename)
        if os.path.exists(candidate):
            compose_file = candidate
            break

    # Find the first Dockerfile anywhere under the sample directory.
    dockerfile = None
    for root, _, files in os.walk(sample_dir):
        if "Dockerfile" in files:
            dockerfile = os.path.join(root, "Dockerfile")
            break

    if not compose_file and not dockerfile:
        print(f"Skipping {project_name}: No compose file or Dockerfile found")
        return None

    result = {"projectName": project_name, "compose": "", "dockerfile": ""}

    # Read as UTF-8 explicitly so parsing does not depend on the host
    # locale (sample repos routinely contain non-ASCII text).
    if compose_file:
        with open(compose_file, 'r', encoding='utf-8') as f:
            result["compose"] = f.read()

    if dockerfile:
        with open(dockerfile, 'r', encoding='utf-8') as f:
            result["dockerfile"] = f.read()

    # Derive the technology list and a human-readable description.
    technologies = get_technologies(result["dockerfile"], result["compose"])
    result["technologies"] = technologies
    result["description"] = generate_description(project_name, technologies)

    return result
115+
116+
def main():
    """Clone the DefangLabs samples repo and export each sample's Dockerfile,
    compose file, detected technologies, and description to a JSON data file.
    """
    repo_url = "https://github.com/DefangLabs/samples"
    output_file = "./data/samples_examples.json"

    # Ensure the output directory exists up front, so a fresh checkout or
    # container does not fail only after the slow network clone step.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with tempfile.TemporaryDirectory() as temp_dir:
        # Clone the repository into a throwaway directory.
        clone_repo(repo_url, temp_dir)

        # Process only the /samples directory of the repo.
        samples_dir = os.path.join(temp_dir, "samples")
        if not os.path.exists(samples_dir):
            print(f"Error: samples directory not found in {temp_dir}")
            return

        # Each immediate subdirectory is one sample project.
        sample_dirs = [os.path.join(samples_dir, d) for d in os.listdir(samples_dir)
                       if os.path.isdir(os.path.join(samples_dir, d))]

        # Extract the metadata for every sample that has a compose file
        # or Dockerfile; process_sample_directory returns None otherwise.
        results = []
        for sample_dir in sample_dirs:
            sample_data = process_sample_directory(sample_dir)
            if sample_data:
                results.append(sample_data)

        # Persist the collected samples for the /data endpoint to serve.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        print(f"Successfully processed {len(results)} samples. Results saved to {output_file}")

if __name__ == "__main__":
    main()

app/rag_system.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
openai.api_key = os.getenv("OPENAI_API_KEY")
1313

1414
class RAGSystem:
15-
def __init__(self, knowledge_base_path='knowledge_base.json'):
15+
def __init__(self, knowledge_base_path='./data/knowledge_base.json'):
1616
self.knowledge_base_path = knowledge_base_path
1717
self.knowledge_base = self.load_knowledge_base()
1818
self.model = SentenceTransformer('all-MiniLM-L6-v2')

app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ sentence-transformers==2.2.1
88
torch==1.10.0
99
huggingface_hub==0.8.1
1010
openai==0.28.0
11+
PyYAML==6.0.2

0 commit comments

Comments
 (0)