Skip to content

Commit 5469c83

Browse files
Merge pull request #64 from DefangLabs/jordan/refactorings
Refactoring get_knowledge_base.py
2 parents 1663b55 + 172b432 commit 5469c83

File tree

5 files changed

+49
-131
lines changed

5 files changed

+49
-131
lines changed

.devcontainer/devcontainer.json

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,13 @@
55
},
66
"features": {
77
"ghcr.io/defanglabs/devcontainer-feature/defang-cli:1.0.4": {},
8-
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
8+
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
9+
"ghcr.io/devcontainers/features/node:1": {
10+
"version": "20"
11+
},
12+
"ghcr.io/devcontainers/features/go:1": {
13+
"version": "1.23"
14+
}
915
},
1016
"mounts": [
1117
// "source=/Users/user/.aws,target=/home/vscode/.aws,type=bind,consistency=cached",

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
.env
22
__pycache__
33
sentence-transformers
4-
.tmp/*
5-
!.tmp/prebuild.sh
4+
.tmp/
65
node_modules

app/.tmp/prebuild.sh

Lines changed: 0 additions & 31 deletions
This file was deleted.

app/Dockerfile

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,9 +36,6 @@ RUN pip install --no-cache-dir -r requirements.txt
3636
# Copy the application source code into the container
3737
COPY . /app
3838

39-
# Make the prebuild.sh script executable
40-
RUN chmod +x .tmp/prebuild.sh
41-
4239
# Expose port 5050 for the Flask application
4340
EXPOSE 5050
4441

app/get_knowledge_base.py

Lines changed: 41 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -7,16 +7,6 @@
77

88
kb_file_path = './data/knowledge_base.json'
99

10-
def clean_tmp(dir_path):
11-
""" Clears out all contents of the specified directory except for prebuild.sh """
12-
for item in os.listdir(dir_path):
13-
item_path = os.path.join(dir_path, item)
14-
if item != "prebuild.sh": # Keep prebuild.sh
15-
if os.path.isdir(item_path):
16-
shutil.rmtree(item_path)
17-
else:
18-
os.remove(item_path)
19-
2010
def clone_repository(repo_url, local_dir):
2111
""" Clone or pull the repository based on its existence. """
2212
if not os.path.exists(local_dir):
@@ -30,7 +20,6 @@ def clone_repository(repo_url, local_dir):
3020
def setup_repositories():
3121
tmp_dir = ".tmp"
3222
os.makedirs(tmp_dir, exist_ok=True)
33-
clean_tmp(tmp_dir) # Clean the temporary directory before setting up
3423

3524
# Define repositories and their URLs
3625
repos = {
@@ -39,65 +28,34 @@ def setup_repositories():
3928
"samples": "https://github.com/DefangLabs/samples.git"
4029
}
4130

42-
# Change to the temporary directory
43-
original_dir = os.getcwd()
44-
os.chdir(tmp_dir)
45-
4631
# Clone each repository
4732
for repo_name, repo_url in repos.items():
48-
clone_repository(repo_url, repo_name)
49-
50-
# Return to the original directory
51-
os.chdir(original_dir)
33+
clone_repository(repo_url, os.path.join(tmp_dir, repo_name))
5234

5335
def run_prebuild_script():
54-
""" Run the 'prebuild.sh' script located in the .tmp directory. """
55-
os.chdir(".tmp")
56-
script_path = os.path.join("./", "prebuild.sh") # Ensure the path is correct
57-
if os.path.exists(script_path):
58-
print("Running prebuild.sh...")
59-
try:
60-
subprocess.run(["bash", script_path], check=True)
61-
except subprocess.CalledProcessError as e:
62-
print(f"Error running prebuild.sh: {e}")
63-
else:
64-
print("prebuild.sh not found.")
65-
66-
def cleanup():
67-
""" Clean up unneeded files, preserving only 'docs' and 'blog' directories """
68-
os.chdir("./defang-docs")
69-
for item in os.listdir('.'):
70-
if item not in ['docs', 'blog']: # Check if the item is not one of the directories to keep
71-
item_path = os.path.join('.', item) # Construct the full path
72-
if os.path.isdir(item_path):
73-
shutil.rmtree(item_path) # Remove the directory and all its contents
74-
else:
75-
os.remove(item_path) # Remove the file
76-
print("Cleanup completed successfully.")
36+
""" Run the defang-docs repo prebuild script"""
37+
38+
subprocess.run(
39+
["npm", "-C", ".tmp/defang-docs", "install"],
40+
check=True,
41+
stdout=subprocess.PIPE,
42+
stderr=subprocess.PIPE
43+
)
44+
45+
subprocess.run(
46+
["npm", "-C", ".tmp/defang-docs", "run", "prebuild"],
47+
check=True,
48+
stdout=subprocess.PIPE,
49+
stderr=subprocess.PIPE
50+
)
7751

7852
def parse_markdown():
7953
""" Parse markdown files in the current directory into JSON """
80-
reset_knowledge_base() # Reset the JSON database file
8154
recursive_parse_directory('./.tmp/defang-docs') # Parse markdown files in the current directory
8255
print("Markdown parsing completed successfully.")
8356

84-
def reset_knowledge_base():
85-
""" Resets or initializes the knowledge base JSON file. """
86-
with open(kb_file_path, 'w') as output_file:
87-
json.dump([], output_file)
88-
89-
def parse_markdown_file_to_json(file_path):
57+
def parse_markdown_file_to_json(json_output, current_id, file_path):
9058
""" Parses individual markdown file and adds its content to JSON """
91-
try:
92-
# Load existing content if the file exists
93-
with open(kb_file_path, 'r') as existing_file:
94-
json_output = json.load(existing_file)
95-
current_id = len(json_output) + 1 # Start ID from the next available number
96-
except (FileNotFoundError, json.JSONDecodeError):
97-
# If the file doesn't exist or is empty, start fresh
98-
json_output = []
99-
current_id = 1
100-
10159
with open(file_path, 'r', encoding='utf-8') as file:
10260
lines = file.readlines()
10361

@@ -148,28 +106,17 @@ def parse_markdown_file_to_json(file_path):
148106
"text": text,
149107
"path": adjust_knowledge_base_entry_path(file_path) # Adjust path format
150108
})
151-
current_id += 1
152-
153-
# Write the augmented JSON output to ./data/knowledge_base.json
154-
with open(kb_file_path, 'w', encoding='utf-8') as output_file:
155-
json.dump(json_output, output_file, indent=2, ensure_ascii=False)
156109

157110
def adjust_knowledge_base_entry_path(file_path):
158111
""" Adjusts the file path format for storage. """
159-
return re.sub(r'\/(\d{4})-(\d{2})-(\d{2})-', r'/\1/\2/\3/', file_path.replace("./.tmp/defang-docs", "").replace(".mdx", "").replace(".md", ""))
112+
return re.sub(r'\/(\d{4})-(\d{2})-(\d{2})-', r'/\1/\2/\3/', normalize_docs_path(file_path))
160113

161-
def parse_cli_markdown(file_path):
162-
""" Parses CLI-specific markdown files """
163-
try:
164-
# Load existing content if the file exists
165-
with open(kb_file_path, 'r') as existing_file:
166-
json_output = json.load(existing_file)
167-
current_id = len(json_output) + 1 # Start ID from the next available number
168-
except (FileNotFoundError, json.JSONDecodeError):
169-
# If the file doesn't exist or is empty, start fresh
170-
json_output = []
171-
current_id = 1
114+
def normalize_docs_path(path):
115+
""" Normalizes the file path to ensure consistent formatting. """
116+
return path.replace("./.tmp/defang-docs", "").replace(".mdx", "").replace(".md", "")
172117

118+
def parse_cli_markdown(json_output, current_id, file_path):
119+
""" Parses CLI-specific markdown files """
173120
with open(file_path, 'r', encoding='utf-8') as file:
174121
lines = file.readlines()
175122

@@ -190,32 +137,32 @@ def parse_cli_markdown(file_path):
190137
"id": current_id,
191138
"about": about,
192139
"text": text,
193-
"path": file_path.replace("./.tmp/defang-docs", "").replace(".mdx", "").replace(".md", "")
140+
"path": normalize_docs_path(file_path)
194141
})
195-
current_id += 1
196-
197-
# Write the augmented JSON output to data/knowledge_base.json
198-
with open(kb_file_path, 'w', encoding='utf-8') as output_file:
199-
json.dump(json_output, output_file, indent=2, ensure_ascii=False)
200142

201143
def recursive_parse_directory(root_dir):
202144
""" Recursively parses all markdown files in the directory. """
203-
for dirpath, dirnames, filenames in os.walk(root_dir):
145+
paths = []
146+
for dirpath, _dirnames, filenames in os.walk(root_dir):
204147
for filename in filenames:
205-
if filename.lower().endswith('.md') or filename.lower().endswith('.mdx'):
206-
file_path = os.path.join(dirpath, filename)
207-
if 'cli' in dirpath.lower() or 'cli' in filename.lower():
208-
parse_cli_markdown(file_path)
209-
else:
210-
parse_markdown_file_to_json(file_path)
148+
lower_filename = filename.lower()
149+
if lower_filename.endswith('.md') or lower_filename.endswith('.mdx'):
150+
paths.append(os.path.join(dirpath, filename))
151+
152+
with open(kb_file_path, 'r') as kb_file:
153+
kb_data = json.load(kb_file)
154+
155+
for id, file_path in enumerate(paths, start=1):
156+
if 'cli' in dirpath.lower() or 'cli' in filename.lower():
157+
parse_cli_markdown(kb_data, id, file_path)
158+
else:
159+
parse_markdown_file_to_json(kb_data, id, file_path)
160+
161+
with open(kb_file_path, 'w') as kb_file:
162+
json.dump(kb_data, kb_file, indent=2)
211163

212164
if __name__ == "__main__":
213165
setup_repositories()
214166
run_prebuild_script()
215-
cleanup()
216-
os.chdir('../../')
217-
print(os.listdir('.'))
218167
parse_markdown() # Start parsing logic after all setups
219-
print(os.listdir('.'))
220-
clean_tmp('./.tmp')
221168
print("All processes completed successfully.")

0 commit comments

Comments
 (0)