import os
- from dotenv import load_dotenv
- load_dotenv()
from fastapi import APIRouter, HTTPException, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
import pathlib
- import asyncio
- import openai
from supabase import AsyncClient
- from tenacity import RetryError

- from app.utils.get_file_paths import get_file_paths
+ from app.services.github import GithubService
+ from app.utils.get_file_paths import get_docs_file_paths, get_packages_file_paths
from app.utils.get_file_content import get_file_content
- from app.utils.chunk_content import chunk_content_by_h2
- from app.services.openai import OpenAIService
from app.db.client import get_db_client
- from app.utils.checksum import calculate_checksum
- from app.utils.extract_title import extract_chunk_title
-
- openai_api_key = os.getenv("OPENAI_KEY") or None
- if openai_api_key is None:
-     raise ValueError("OpenAI api key is missing")
+ from app.utils.process_docs_file_and_update_db import process_docs_file_and_update_db
+ from app.utils.process_package_docs_and_update_db import process_package_docs_and_update_db

router = APIRouter()
- openai_service = OpenAIService(openai_api_key=openai_api_key)
security = HTTPBearer()

- ###########################################################################################################
- # HELPER FUNCTIONS
- ###########################################################################################################
-
- async def safe_db_operation(task):
-     try:
-         await task
-     except Exception as e:
-         print(f"Database operation failed: {e}")
-         return None
-     return True
-
-
- async def process_file_and_update_db(file_content: str, relative_path: str, supabase: AsyncClient):
-     chunks = chunk_content_by_h2(file_content)
-     cache_key = calculate_checksum(file_content)
-     current_chunk_data = {}
-
-     # current chunks
-     for idx, chunk in enumerate(chunks):
-         chunk_title = extract_chunk_title(chunk)
-
-         if not chunk_title:
-             continue
-         current_chunk_data[chunk_title] = {
-             "chunk": chunk,
-             "chunk_id": idx,
-             "checksum": calculate_checksum(chunk)
-         }
-
-     # existing chunks
-     try:
-         response = await supabase.table("docs") \
-             .select("id", "chunk_title", "chunk_id", "checksum") \
-             .eq("filepath", relative_path) \
-             .execute()
-
-         existing_records = {
-             record["chunk_title"]: record
-             for record in response.data
-         }
-     except Exception as e:
-         print(f"Failed to fetch existing records for {relative_path}: {e}")
-         return
-
-     # compare
-     chunks_to_embed = []
-     db_operations = []
-
-     all_keys = set(current_chunk_data.keys()) | set(existing_records.keys())
-
-     for key in all_keys:
-         chunk_title = key
-         current = current_chunk_data.get(key)
-         existing = existing_records.get(key)
-
-         if current and existing:
-             if current["checksum"] == existing["checksum"]:
-                 print(f"Skipping the unchanged chunk: {chunk_title}")
-
-             if current["checksum"] != existing["checksum"]:
-                 print(f"Updating chunk: {chunk_title}")
-                 try:
-                     response = await openai_service.situate_context(file_content, current["chunk"], cache_key=cache_key)
-                     await asyncio.sleep(1)
-                     contextual_chunk = "---".join([response, current["chunk"]])
-                     chunks_to_embed.append(contextual_chunk)
-
-                     db_operations.append({
-                         "filepath": relative_path,
-                         "chunk_id": current["chunk_id"],
-                         "chunk_title": chunk_title,
-                         "checksum": current["checksum"],
-                         "content": current["chunk"],
-                         "record_id": existing["id"],
-                         "is_update": True
-                     })
-                 except (openai.APIError, openai.AuthenticationError, openai.RateLimitError, RetryError) as e:
-                     print(f"Skipping chunk {chunk_title} due to OpenAI 'situate_context' API error: {e}")
-                     continue
-
-             elif current["chunk_id"] != existing.get("chunk_id"):
-                 print(f"Updating chunk order for {chunk_title}")
-                 await safe_db_operation(
-                     supabase.table("docs").update({"chunk_id": current["chunk_id"]}).eq("id", existing["id"]).execute()
-                 )
-
-         elif current and not existing:
-             print(f"New chunk {chunk_title}")
-             try:
-                 response = await openai_service.situate_context(file_content, current["chunk"], cache_key=cache_key)
-                 await asyncio.sleep(1)
-                 contextual_chunk = "---".join([response, current["chunk"]])
-                 chunks_to_embed.append(contextual_chunk)
-                 db_operations.append({
-                     "filepath": relative_path,
-                     "chunk_id": current["chunk_id"],
-                     "chunk_title": chunk_title,
-                     "checksum": current["checksum"],
-                     "content": current["chunk"],
-                     "is_update": False
-                 })
-             except (openai.APIError, openai.AuthenticationError, openai.RateLimitError, RetryError) as e:
-                 print(f"Skipping chunk {chunk_title} due to OpenAI 'situate_context' API error: {e}")
-                 continue
-
-         elif not current and existing:
-             print(f"Deleting chunk: {chunk_title}")
-             await safe_db_operation(
-                 supabase.table("docs").delete().eq("id", existing["id"]).execute()
-             )
-
-     if chunks_to_embed:
-         try:
-             embeddings = await openai_service.get_batch_embeddings(chunks_to_embed)
-         except (openai.APIError, openai.AuthenticationError, openai.RateLimitError, RetryError) as e:
-             print(f"Skipping all DB operations for this file due to failed embedding batch: {e}")
-             return
-
-         db_tasks = []
-         for i, embedding in enumerate(embeddings):
-             if not embedding:
-                 print(f"Skipping DB operation for chunk '{db_operations[i]['chunk_title']}' due to failed embedding")
-                 continue
-
-             operation_data = db_operations[i]
-
-             if operation_data["is_update"]:
-                 db_tasks.append(
-                     safe_db_operation(
-                         supabase.table("docs").update({
-                             "content": operation_data["content"],
-                             "contextual_text": chunks_to_embed[i],
-                             "embedding": embedding,
-                             "filepath": operation_data["filepath"],
-                             "chunk_id": operation_data["chunk_id"],
-                             "chunk_title": operation_data["chunk_title"],
-                             "checksum": operation_data["checksum"]
-                         }).eq("id", operation_data["record_id"]).execute()
-                     )
-                 )
-             else:
-                 db_tasks.append(
-                     safe_db_operation(
-                         supabase.table("docs").insert({
-                             "content": operation_data["content"],
-                             "contextual_text": chunks_to_embed[i],
-                             "embedding": embedding,
-                             "filepath": operation_data["filepath"],
-                             "chunk_id": operation_data["chunk_id"],
-                             "chunk_title": operation_data["chunk_title"],
-                             "checksum": operation_data["checksum"]
-                         }).execute()
-                     )
-                 )
-
-         await asyncio.gather(*db_tasks)
-
-
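The helpers deleted above now live under app/utils/, per the new process_docs_file_and_update_db and process_package_docs_and_update_db imports. Their core is a title-keyed checksum diff between the freshly chunked file and the rows already stored in the "docs" table. A minimal sketch of that comparison, with illustrative names only, not the extracted module's exact API:

    # Sketch of the sync decision in the removed code above.
    def diff_chunks(current: dict, existing: dict) -> tuple[list, list]:
        to_embed, to_delete = [], []
        for title in set(current) | set(existing):
            cur, old = current.get(title), existing.get(title)
            if cur and old and cur["checksum"] != old["checksum"]:
                to_embed.append((title, "update"))   # content changed: re-contextualize and re-embed
            elif cur and not old:
                to_embed.append((title, "insert"))   # brand-new section
            elif old and not cur:
                to_delete.append(old["id"])          # section removed from the file
        return to_embed, to_delete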
###########################################################################################################
# ENDPOINTS
###########################################################################################################
@@ -197,11 +27,14 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="You are not authorized"
        )
+
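+     # Refresh the local docs checkout from GitHub before re-indexing (assumed behavior of download_docs)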
+     github = GithubService()
+     await github.download_docs()

    docs_dir = pathlib.Path(__file__).resolve().parents[3] / "docs"

    try:
-         file_paths = get_file_paths(docs_dir)
+         file_paths = get_docs_file_paths(docs_dir)
    except FileNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
@@ -217,7 +50,7 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi
        abs_path = docs_dir / relative_path
        try:
            file_content = get_file_content(abs_path)
-             await process_file_and_update_db(file_content, relative_path, supabase)
+             await process_docs_file_and_update_db(file_content, relative_path, supabase)
        except (FileNotFoundError, IOError) as e:
            print(f"Skipping file due to error: {e}")
            continue
@@ -230,4 +63,47 @@ async def ingest_docs(credentials: HTTPAuthorizationCredentials = Depends(securi

    return {
        "message": "Ingestion process successfully completed"
+     }
+
+
+ @router.post("/packages")
+ async def ingest_packages(credentials: HTTPAuthorizationCredentials = Depends(security), supabase: AsyncClient = Depends(get_db_client)):
+
+     token = credentials.credentials
+     if not token or token != os.getenv("ADMIN_KEY"):
+         raise HTTPException(
+             status_code=status.HTTP_401_UNAUTHORIZED,
+             detail="You are not authorized"
+         )
+
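+     # parents[6] assumes the mesh repo is checked out six directories above this file; fragile if the layout changes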
+     packages_docs_md_path = pathlib.Path(__file__).resolve().parents[6] / "mesh/docs/markdown"
+
+     try:
+         files_path = get_packages_file_paths(packages_docs_md_path)
+     except FileNotFoundError as e:
+         raise HTTPException(
+             status_code=status.HTTP_404_NOT_FOUND,
+             detail=f"The packages docs folder was not found: {e}"
+         )
+     except IOError as e:
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"An I/O error occurred while accessing the packages docs directory: {e}"
+         )
+
+     for abs_path in files_path:
+         relative_path = str(pathlib.Path(abs_path).relative_to(packages_docs_md_path))
+         try:
+             file_content = get_file_content(abs_path)
+             await process_package_docs_and_update_db(file_content, relative_path, supabase)
+         except (FileNotFoundError, IOError) as e:
+             print(f"Skipping the file '{relative_path}' due to an error: {e}")
+         except Exception as e:
+             raise HTTPException(
+                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                 detail=f"An error occurred during file ingestion: {e}"
+             )
+
+     return {
+         "message": "Ingestion process successful for package docs"
    }
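Both endpoints gate on the same Bearer token comparison against ADMIN_KEY, so a smoke test is short. A sketch with httpx, assuming the router is mounted at the app root on localhost:8000; base URL, port, and mount path are assumptions:

    # Hypothetical smoke test for the new /packages endpoint.
    import os
    import httpx

    resp = httpx.post(
        "http://localhost:8000/packages",
        headers={"Authorization": f"Bearer {os.getenv('ADMIN_KEY')}"},
        timeout=None,  # ingestion embeds every changed chunk, so allow plenty of time
    )
    print(resp.status_code, resp.json())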