
Commit 9d49732

embed mesh packages into db; modularize the code
1 parent ec9128a commit 9d49732

14 files changed: +408 -195 lines changed

apps/meshjs-rag/app/api/v1/ask_mesh_ai.py

Lines changed: 0 additions & 2 deletions
@@ -5,8 +5,6 @@
 from pydantic import BaseModel
 from supabase import AsyncClient
 import openai
-from dotenv import load_dotenv
-load_dotenv()
 import os
 
 from app.services.openai import OpenAIService
Lines changed: 52 additions & 176 deletions
@@ -1,189 +1,19 @@
 import os
-from dotenv import load_dotenv
-load_dotenv()
 from fastapi import APIRouter, HTTPException, status, Depends
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 import pathlib
-import asyncio
-import openai
 from supabase import AsyncClient
-from tenacity import RetryError
 
-from app.utils.get_file_paths import get_file_paths
+from app.services.github import GithubService
+from app.utils.get_file_paths import get_docs_file_paths, get_packages_file_paths
 from app.utils.get_file_content import get_file_content
-from app.utils.chunk_content import chunk_content_by_h2
-from app.services.openai import OpenAIService
 from app.db.client import get_db_client
-from app.utils.checksum import calculate_checksum
-from app.utils.extract_title import extract_chunk_title
-
-openai_api_key = os.getenv("OPENAI_KEY") or None
-if openai_api_key is None:
-    raise ValueError("OpenAI api key is missing")
+from app.utils.process_docs_file_and_update_db import process_docs_file_and_update_db
+from app.utils.process_package_docs_and_update_db import process_package_docs_and_update_db
 
 router = APIRouter()
-openai_service = OpenAIService(openai_api_key=openai_api_key)
 security = HTTPBearer()
 
-###########################################################################################################
-# HELPER FUNCTIONS
-###########################################################################################################
-
-async def safe_db_operation(task):
-    try:
-        await task
-    except Exception as e:
-        print(f"Database operation failed: {e}")
-        return None
-    return True
-
-
-async def process_file_and_update_db(file_content: str, relative_path: str, supabase: AsyncClient):
-    chunks = chunk_content_by_h2(file_content)
-    cache_key = calculate_checksum(file_content)
-    current_chunk_data = {}
-
-    # current chunks
-    for idx, chunk in enumerate(chunks):
-        chunk_title = extract_chunk_title(chunk)
-
-        if not chunk_title:
-            continue
-        current_chunk_data[chunk_title] = {
-            "chunk": chunk,
-            "chunk_id": idx,
-            "checksum": calculate_checksum(chunk)
-        }
-
-    # existing chunks
-    try:
-        response = await supabase.table("docs") \
-            .select("id", "chunk_title", "chunk_id", "checksum") \
-            .eq("filepath", relative_path) \
-            .execute()
-
-        existing_records = {
-            record["chunk_title"]: record
-            for record in response.data
-        }
-    except Exception as e:
-        print(f"Failed to fetch existing records for {relative_path}: {e}")
-        return
-
-    # compare
-    chunks_to_embed = []
-    db_operations = []
-
-    all_keys = set(current_chunk_data.keys()) | set(existing_records.keys())
-
-    for key in all_keys:
-        chunk_title = key
-        current = current_chunk_data.get(key)
-        existing = existing_records.get(key)
-
-        if current and existing:
-            if current["checksum"] == existing["checksum"]:
-                print(f"Skipping the unchanged chunk: {chunk_title}")
-
-            if current["checksum"] != existing["checksum"]:
-                print(f"Updating chunk: {chunk_title}")
-                try:
-                    response = await openai_service.situate_context(file_content, current["chunk"], cache_key=cache_key)
-                    await asyncio.sleep(1)
-                    contextual_chunk = "---".join([response, current["chunk"]])
-                    chunks_to_embed.append(contextual_chunk)
-
-                    db_operations.append({
-                        "filepath": relative_path,
-                        "chunk_id": current["chunk_id"],
-                        "chunk_title": chunk_title,
-                        "checksum": current["checksum"],
-                        "content": current["chunk"],
-                        "record_id": existing["id"],
-                        "is_update": True
-                    })
-                except (openai.APIError, openai.AuthenticationError, openai.RateLimitError, RetryError) as e:
-                    print(f"Skipping chunk {chunk_title} due to OpenAI 'situation_context' API error: {e}")
-                    continue
-
-            elif current["chunk_id"] != existing.get("chunk_id"):
-                print(f"Updating chunk order for {chunk_title}")
-                await safe_db_operation(
-                    supabase.table("docs").update({"chunk_id": current["chunk_id"]}).eq("id", existing["id"]).execute()
-                )
-
-        elif current and not existing:
-            print(f"New chunk {chunk_title}")
-            try:
-                response = await openai_service.situate_context(file_content, current["chunk"], cache_key=cache_key)
-                await asyncio.sleep(1)
-                contextual_chunk = "---".join([response, current["chunk"]])
-                chunks_to_embed.append(contextual_chunk)
-                db_operations.append({
-                    "filepath": relative_path,
-                    "chunk_id": current["chunk_id"],
-                    "chunk_title": chunk_title,
-                    "checksum": current["checksum"],
-                    "content": current["chunk"],
-                    "is_update": False
-                })
-            except (openai.APIError, openai.AuthenticationError, openai.RateLimitError, RetryError) as e:
-                print(f"Skipping chunk {chunk_title} due to OpenAI 'situation_context' API error: {e}")
-                continue
-
-        elif not current and existing:
-            print(f"Deleting chunk: {chunk_title}")
-            await safe_db_operation(
-                supabase.table("docs").delete().eq("id", existing["id"]).execute()
-            )
-
-    if chunks_to_embed:
-        try:
-            embeddings = await openai_service.get_batch_embeddings(chunks_to_embed)
-        except (openai.APIError, openai.AuthenticationError, openai.RateLimitError, RetryError) as e:
-            print(f"Skipping all DB operations for this file due to failed embedding batch: {e}")
-            return
-
-        db_tasks = []
-        for i, embedding in enumerate(embeddings):
-            if not embedding:
-                print(f"Skipping DB operation for chunk '{db_operations[i]["chunk_title"]}' due to failed embedding")
-                continue
-
-            operation_data = db_operations[i]
-
-            if operation_data["is_update"]:
-                db_tasks.append(
-                    safe_db_operation(
-                        supabase.table("docs").update({
-                            "content": operation_data["content"],
-                            "contextual_text": chunks_to_embed[i],
-                            "embedding": embedding,
-                            "filepath": operation_data["filepath"],
-                            "chunk_id": operation_data["chunk_id"],
-                            "chunk_title": operation_data["chunk_title"],
-                            "checksum": operation_data["checksum"]
-                        }).eq("id", operation_data["record_id"]).execute()
-                    )
-                )
-            else:
-                db_tasks.append(
-                    safe_db_operation(
-                        supabase.table("docs").insert({
-                            "content": operation_data["content"],
-                            "contextual_text": chunks_to_embed[i],
-                            "embedding": embedding,
-                            "filepath": operation_data["filepath"],
-                            "chunk_id": operation_data["chunk_id"],
-                            "chunk_title": operation_data["chunk_title"],
-                            "checksum": operation_data["checksum"]
-                        }).execute()
-                    )
-                )
-
-        await asyncio.gather(*db_tasks)
-
-
 ###########################################################################################################
 # ENDPOINTS
 ###########################################################################################################
@@ -197,11 +27,14 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi
             status_code=status.HTTP_401_UNAUTHORIZED,
             detail="You are not authorized"
         )
+
+    github = GithubService()
+    await github.download_docs()
 
     docs_dir = pathlib.Path(__file__).resolve().parents[3] / "docs"
 
     try:
-        file_paths = get_file_paths(docs_dir)
+        file_paths = get_docs_file_paths(docs_dir)
     except FileNotFoundError as e:
         raise HTTPException(
             status_code=status.HTTP_404_NOT_FOUND,
@@ -217,7 +50,7 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi
         abs_path = docs_dir / relative_path
         try:
             file_content = get_file_content(abs_path)
-            await process_file_and_update_db(file_content, relative_path, supabase)
+            await process_docs_file_and_update_db(file_content, relative_path, supabase)
         except (FileNotFoundError, IOError) as e:
             print(f"Skipping file due to error: {e}")
             continue
@@ -230,4 +63,47 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi
 
     return {
         "message": "Ingestion process successfully completed"
+    }
+
+
+@router.post("/packages")
+async def ingest_packages(credentials: HTTPAuthorizationCredentials = Depends(security), supabase: AsyncClient = Depends(get_db_client)):
+
+    token = credentials.credentials
+    if not token or token != os.getenv("ADMIN_KEY"):
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="You're not authorized"
+        )
+
+    packages_docs_md_path = pathlib.Path(__file__).resolve().parents[6] / "mesh/docs/markdown"
+
+    try:
+        files_path = get_packages_file_paths(packages_docs_md_path)
+    except FileExistsError as e:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail=f"The packages docs folder was not found: {e}"
+        )
+    except IOError as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            details=f"An I/O error occured while accessing the packages docs directory: {e}"
+        )
+
+    for abs_path in files_path:
+        relative_path = str(pathlib.Path(abs_path).relative_to(packages_docs_md_path))
+        try:
+            file_content = get_file_content(abs_path)
+            await process_package_docs_and_update_db(file_content, str(relative_path), supabase)
+        except (FileNotFoundError, IOError) as e:
+            print(f"Skipping the file '{relative_path}' due to an error: {e}")
+        except Exception as e:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail=f"An error occured during the file ingestion: {e}"
+            )
+
+    return {
+        "message": "Ingestion process successful for package docs"
     }
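The new POST /packages route mirrors the existing docs ingestion flow but walks the mesh package markdown instead. A minimal invocation sketch, assuming the router is mounted under an /api/v1 prefix and the service runs locally on port 8000 (neither is shown in this diff), with ADMIN_KEY exported in the caller's environment:

# Hypothetical caller for the new package-ingestion endpoint (illustrative only).
# Assumed, not shown in this commit: base URL http://localhost:8000 and an /api/v1 prefix.
import os
import httpx

def trigger_package_ingestion() -> dict:
    response = httpx.post(
        "http://localhost:8000/api/v1/packages",
        headers={"Authorization": f"Bearer {os.environ['ADMIN_KEY']}"},
        timeout=None,  # ingestion embeds many chunks, so no client-side timeout
    )
    response.raise_for_status()
    return response.json()  # {"message": "Ingestion process successful for package docs"}

if __name__ == "__main__":
    print(trigger_package_ingestion())

A missing or mismatched token returns 401, matching the ADMIN_KEY check at the top of the endpoint.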

apps/meshjs-rag/app/db/client.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-from dotenv import load_dotenv
-load_dotenv()
 import os
 from supabase import acreate_client, AsyncClient as Client
 
apps/meshjs-rag/app/db/setup_db.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-from dotenv import load_dotenv
-load_dotenv()
 import os
 import asyncpg
 import asyncio

apps/meshjs-rag/app/services/github.py

Lines changed: 0 additions & 2 deletions
@@ -1,5 +1,3 @@
-from dotenv import load_dotenv
-load_dotenv()
 import os
 import httpx
 import asyncio
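With these removals, none of the modules above call load_dotenv() themselves any more. Where environment loading now happens is not part of this commit; a minimal sketch of the usual alternative, assuming a single call at a hypothetical application entrypoint:

# Hypothetical entrypoint (not in this commit): load .env once before the rest of the
# app is imported, so modules keep reading os.getenv(...) without calling load_dotenv().
from dotenv import load_dotenv

load_dotenv()

from fastapi import FastAPI  # noqa: E402  (imported after load_dotenv on purpose)

app = FastAPI()
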
Lines changed: 37 additions & 6 deletions
@@ -1,15 +1,46 @@
-def chunk_content_by_h2(content: str):
+from typing import List
+import re
+
+def flush_chunk(current_chunk: List[str], chunks: List[str]):
+    if current_chunk:
+        chunks.append("\n".join(current_chunk).strip())
+
+def chunk_content_by_h2(content: str) -> List[str]:
     chunks = []
     current_chunk = []
     for line in content.splitlines():
         if line.startswith("## ") and line.strip()[-6:] != "[!toc]":
-            if current_chunk:
-                chunks.append("\n".join(current_chunk).strip())
+            flush_chunk(current_chunk, chunks)
             current_chunk = [line]
         else:
             current_chunk.append(line)
 
-    if current_chunk:
-        chunks.append("/n".join(current_chunk).strip())
+    flush_chunk(current_chunk, chunks)
+
+    return chunks
+
+
+def chunk_class_file(content: str) -> List[str]:
+    chunks = []
+    current_chunk = []
+    lines = content.splitlines()
+
+    for i, line in enumerate(lines):
+        if (
+            line.startswith("## Constructors") \
+            or (line.startswith("## Properties") and lines[i+2].startswith("###")) \
+            or (line.startswith("## Methods") and lines[i+2].startswith("###"))
+        ):
+            flush_chunk(current_chunk, chunks)
+            current_chunk = [line]
+        else:
+            current_chunk.append(line)
+
+    flush_chunk(current_chunk, chunks)
+
+    if chunks and chunks[-1].startswith("## Methods"):
+        chunk = chunks.pop()
+        method_chunks = [method.strip() for method in chunk.split("***") if method.strip()]
+        chunks.extend(method_chunks)
 
-    return chunks
+    return chunks
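A quick usage sketch of the reworked chunker, importing it from app.utils.chunk_content (the module path used by the old import in the ingestion diff above); the markdown sample is made up for illustration:

# Illustrative input only; chunk_content_by_h2 starts a new chunk at every "## " heading
# that does not end in "[!toc]".
from app.utils.chunk_content import chunk_content_by_h2

sample_doc = """# Guide
Intro paragraph.

## Install
Run the installer.

## Configure
Edit the config file."""

chunks = chunk_content_by_h2(sample_doc)
# chunks == [
#     "# Guide\nIntro paragraph.",
#     "## Install\nRun the installer.",
#     "## Configure\nEdit the config file.",
# ]

chunk_class_file works the same way but keys on the "## Constructors", "## Properties", and "## Methods" sections of a class page, then splits the final methods chunk on "***" separators.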
Lines changed: 28 additions & 3 deletions
@@ -1,9 +1,34 @@
+from typing import List
+
 def extract_chunk_title(chunk: str) -> str:
     chunk = chunk.replace("/n", "\n") # what a culprit, ugh
     for line in chunk.splitlines():
         if line.startswith("## "):
             return line.strip()[3:]
-        elif line.startswith("title: "):
+        if line.startswith("title: "):
             return line.strip()[7:]
-        else:
-            return ""
+        ##
+        finds = ["# Class: ", "# Function: ", "# Type Alias: ", "# Variable: ", "# Interface: "]
+        if line.startswith(tuple(finds)):
+            return line.strip()
+
+    return ""
+
+def extract_class_chunk_title(chunk: str, chunks: List[str]) -> str:
+    chunk = chunk.replace("/n", "\n")
+    class_title = extract_chunk_title(chunks[0]) or "UnknownClass"
+
+    for line in chunk.splitlines():
+        if line.startswith("# Class: "):
+            return line.strip()
+        if line.startswith("## Constructors"):
+            return f"{class_title}'s constructor"
+        if line.startswith("## Properties"):
+            return f"{class_title}'s properties"
+        if line.startswith("## Methods"):
+            continue
+        if line.startswith("### "):
+            return f"{class_title}'s method: {line.removeprefix('### ').strip()}"
+
+    return ""
+
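For reference, a small usage sketch of the two title helpers, with made-up typedoc-style chunks (the app.utils.extract_title module path comes from the import removed in the ingestion file above):

# Illustrative chunks shaped like the typedoc markdown these helpers target.
from app.utils.extract_title import extract_chunk_title, extract_class_chunk_title

class_chunks = [
    "# Class: Transaction\n\nBuilds transactions.",
    "## Constructors\n\n### new Transaction()",
    "### sendLovelace()\n\nAdds a lovelace output.",
]

extract_chunk_title(class_chunks[0])
# -> "# Class: Transaction"

extract_class_chunk_title(class_chunks[1], class_chunks)
# -> "# Class: Transaction's constructor"

extract_class_chunk_title(class_chunks[2], class_chunks)
# -> "# Class: Transaction's method: sendLovelace()"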
