Skip to content

Commit 3e8bb6f

Browse files
committed
Embed Aiken docs into the DB; make the GitHub service configurable (owner, repo, doc path, output path); assign a fallback chunk title to chunks that have no title
1 parent 9d49732 commit 3e8bb6f

File tree

6 files changed

+67
-16
lines changed

6 files changed

+67
-16
lines changed

apps/meshjs-rag/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
.vscode/
33
.venv/
44
**/__pycache__/
5-
docs/
5+
docs/
6+
aiken-docs/

apps/meshjs-rag/app/api/v1/ingest.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi
2828
detail="You are not authorized"
2929
)
3030

31-
github = GithubService()
31+
github = GithubService(owner="MeshJS", repo="mimir", doc_path="apps/docs/content/docs", output_path="docs")
3232
await github.download_docs()
3333

3434
docs_dir = pathlib.Path(__file__).resolve().parents[3] / "docs"
@@ -106,4 +106,52 @@ async def ingest_packages(credentials: HTTPAuthorizationCredentials = Depends(se
106106

107107
return {
108108
"message": "Ingestion process successful for package docs"
109+
}
110+
111+
112+
@router.post("/aiken-docs")
async def ingest_aiken_docs(
    credentials: HTTPAuthorizationCredentials = Depends(security),
    supabase: AsyncClient = Depends(get_db_client),
):
    """Download the Aiken docs from GitHub and ingest each file into the database.

    The handler was previously named ``ingest_packages``, which duplicated the
    name of the package-docs handler in this module and shadowed it at module
    level. The route path, parameters and responses are unchanged.

    Steps:
      1. Check the bearer token against the ``ADMIN_KEY`` environment variable.
      2. Download ``src/pages`` of ``aiken-lang/site`` into ``aiken-docs``.
      3. List the downloaded files and pass each file's content to
         ``process_docs_file_and_update_db``.

    Args:
        credentials: Bearer credentials resolved by the ``security`` dependency.
        supabase: Async database client resolved by ``get_db_client``.

    Returns:
        A dict with a single ``"message"`` key on success.

    Raises:
        HTTPException: 401 when the token is missing or does not match
            ``ADMIN_KEY``; 404 when the docs directory is not found; 500 on an
            I/O error while listing the directory or on any unexpected error
            while ingesting a file.
    """
    # Function-scope import so this block does not depend on the module's
    # import section already providing ``secrets``.
    import secrets

    token = credentials.credentials
    admin_key = os.getenv("ADMIN_KEY")
    # Constant-time comparison avoids leaking how much of the key matched.
    # Comparing bytes keeps compare_digest from raising on non-ASCII strings.
    # An unset or empty ADMIN_KEY still rejects every token, as before.
    authorized = bool(token) and bool(admin_key) and secrets.compare_digest(
        token.encode("utf-8"), admin_key.encode("utf-8")
    )
    if not authorized:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="You're not authorized"
        )

    github = GithubService(owner="aiken-lang", repo="site", doc_path="src/pages", output_path="aiken-docs")
    await github.download_docs()

    # NOTE(review): assumes GithubService writes to "aiken-docs" three levels
    # above this file - confirm against GithubService.download_docs.
    aiken_docs_md_path = pathlib.Path(__file__).resolve().parents[3] / "aiken-docs"

    try:
        file_paths = get_docs_file_paths(aiken_docs_md_path)
    except FileNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"The documents directory was not found: {e}"
        ) from e
    except IOError as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"An I/O error occurred while accessing the documents directory: {e}"
        ) from e

    for relative_path in file_paths:
        abs_path = aiken_docs_md_path / relative_path
        try:
            file_content = get_file_content(abs_path)
            await process_docs_file_and_update_db(file_content, relative_path, supabase)
        except (FileNotFoundError, IOError) as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Skipping file due to error: {e}")
            continue
        except Exception as e:
            # Anything else is unexpected; stop and report it, keeping the cause.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"An error occured during the file ingestion: {e}"
            ) from e

    return {
        "message": "Ingestion process successfully completed"
    }

apps/meshjs-rag/app/services/github.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
import pathlib
55

66
class GithubService:
7-
def __init__(self):
7+
def __init__(self, owner, repo, doc_path, output_path):
88
self.base_url="https://api.github.com"
9-
self.owner="smutyala1at"
10-
self.repo="meshjs-docs"
11-
self.doc_path="content/docs"
12-
self.output_path="docs"
9+
self.owner=owner
10+
self.repo=repo
11+
self.doc_path=doc_path
12+
self.output_path=output_path
1313
self.token=os.getenv("GITHUB_TOKEN") or None
1414

1515
def _get_headers(self):
@@ -28,7 +28,7 @@ async def _fetch_github_dir(self,client: httpx.AsyncClient, remote_path: str):
2828
response = await client.get(url)
2929
response.raise_for_status()
3030
return response.json()
31-
except httpx.HTTPStatusException as e:
31+
except httpx.HTTPError as e:
3232
print(f"Error fetching the directory '{remote_path}': HTTP status {e.response.status_code}")
3333
return None
3434
except httpx.RequestError as e:
@@ -81,6 +81,6 @@ async def download_docs(self):
8181

8282

8383
if __name__ == "__main__":
84-
github = GithubService()
84+
github = GithubService(owner="MeshJS", repo="mimir", doc_path="apps/docs/content/docs", output_path="docs")
8585
asyncio.run(github.download_docs())
8686
print("Successfully downloaded docs from github")

apps/meshjs-rag/app/utils/chunk_content.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ def chunk_content_by_h2(content: str) -> List[str]:
99
chunks = []
1010
current_chunk = []
1111
for line in content.splitlines():
12-
if line.startswith("## ") and line.strip()[-6:] != "[!toc]":
12+
if line and line.startswith("## ") and not line.strip().endswith("[!toc]"):
1313
flush_chunk(current_chunk, chunks)
1414
current_chunk = [line]
1515
else:

apps/meshjs-rag/app/utils/process_chunks.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ async def process_chunks_and_update_db(
3030
chunk_title = title_extractor(chunk, idx, chunks)
3131

3232
if not chunk_title:
33-
continue
33+
filepath = relative_path.split("/")
34+
chunk_title = f"{filepath[-2]}/{filepath[-1]}_1" if len(filepath) > 1 else f"{relative_path}_1"
35+
3436
current_chunk_data[chunk_title] = {
3537
"chunk": chunk,
3638
"chunk_id": idx,
@@ -65,9 +67,6 @@ async def process_chunks_and_update_db(
6567
existing = existing_records.get(key)
6668

6769
if current and existing:
68-
if current["checksum"] == existing["checksum"]:
69-
print(f"Skipping the unchanged chunk: {chunk_title}")
70-
7170
if current["checksum"] != existing["checksum"]:
7271
print(f"Updating chunk: {chunk_title}")
7372
try:
@@ -91,13 +90,16 @@ async def process_chunks_and_update_db(
9190

9291
elif current["chunk_id"] != existing.get("chunk_id"):
9392
print(f"Updating chunk order for {chunk_title}")
94-
updated_title = chunk_title.split("_")[0] + f"_{current["chunk_id"]}" if "_" in chunk_title else chunk_title
93+
updated_title = chunk_title.split("_")[0] + f"_{current['chunk_id']}" if "_" in chunk_title else chunk_title
9594
await safe_db_operation(
9695
supabase.table("docs").update({"chunk_id": current["chunk_id"], "chunk_title": updated_title}).eq("id", existing["id"]).execute()
9796
)
9897
# reorder after update
9998
needs_reorder = True
10099

100+
else:
101+
print(f"Skipping the unchanged chunk: {chunk_title}")
102+
101103
elif current and not existing:
102104
print(f"New chunk {chunk_title}")
103105
try:

apps/meshjs-rag/app/utils/reorder_chunks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ async def reorder_chunks(
1717
if chunk["chunk_id"] != new_idx:
1818
updated_title = chunk["chunk_title"].split("_")[0] + f"_{new_idx}" if "_" in chunk["chunk_title"] else chunk["chunk_title"]
1919
reorder_chunks.append(
20-
await safe_db_operation(
20+
safe_db_operation(
2121
supabase.table("docs") \
2222
.update({"chunk_id": new_idx, "chunk_title": updated_title}) \
2323
.eq("id", chunk["id"]) \

0 commit comments

Comments
 (0)