1- from llama_index .core import Document , Settings , SimpleDirectoryReader , StorageContext , VectorStoreIndex
2- from llama_index .core .node_parser import SentenceSplitter , CodeSplitter , MarkdownNodeParser , JSONNodeParser
1+ from llama_index .core import (
2+ Document ,
3+ Settings ,
4+ SimpleDirectoryReader ,
5+ StorageContext ,
6+ VectorStoreIndex ,
7+ )
8+ from llama_index .core .node_parser import (
9+ SentenceSplitter ,
10+ CodeSplitter ,
11+ MarkdownNodeParser ,
12+ JSONNodeParser ,
13+ )
314from llama_index .vector_stores .elasticsearch import ElasticsearchStore
415from dotenv import load_dotenv
516from llama_index .embeddings .openai import OpenAIEmbedding
1728import glob
1829import os
1930
20- #logging.basicConfig(stream=sys.stdout, level=logging.INFO)
21- #logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
22- #logging.getLogger("elasticsearch").setLevel(logging.DEBUG)
2331
# Patch the running event loop so nested asyncio.run() calls work
# (needed because llama-index readers/indexers run their own loops).
nest_asyncio.apply()
2533
@@ -38,16 +46,19 @@ def clone_repository(owner, repo, branch, base_path="/tmp"):
3846 raise ValueError (
3947 "Branch is not provided and GITHUB_BRANCH environment variable is not set."
4048 )
41-
49+
50+
4251 local_repo_path = os .path .join (base_path , owner , repo )
4352 clone_url = f"https://github.com/{ owner } /{ repo } .git"
44-
53+
54+
4555 if os .path .exists (local_repo_path ):
4656 print (f"Repository already exists at { local_repo_path } . Skipping clone." )
4757 return local_repo_path
4858
4959 attempts = 3
50-
60+
61+
5162 for attempt in range (attempts ):
5263 try :
5364 os .makedirs (local_repo_path , exist_ok = True )
@@ -65,6 +76,7 @@ def clone_repository(owner, repo, branch, base_path="/tmp"):
6576 else :
6677 raise Exception ("Failed to clone repository after multiple attempts" )
6778
79+
6880def print_docs_and_nodes (docs , nodes ):
6981 print ("\n === Documents ===\n " )
7082 for doc in docs :
@@ -76,11 +88,13 @@ def print_docs_and_nodes(docs, nodes):
7688 print (f"Node ID: { node .id_ } " )
7789 print (f"Node Content:\n { node .text } \n \n ---\n " )
7890
91+
def collect_and_print_file_summary(file_summary):
    """Print the collected per-extension file summary lines to stdout.

    Args:
        file_summary: Iterable of printable summary strings (one entry per
            file-extension group, e.g. "Found 3 ['.py'] files ...").

    Returns:
        None. Output is written to stdout only.
    """
    # Header is separated by blank lines so it stands out in the console log.
    print("\n=== File Summary ===\n")
    for entry in file_summary:
        print(entry)
8396
97+
8498def parse_documents ():
8599 owner = os .getenv ("GITHUB_OWNER" )
86100 repo = os .getenv ("GITHUB_REPO" )
@@ -91,7 +105,8 @@ def parse_documents():
91105 raise ValueError (
92106 "GITHUB_OWNER and GITHUB_REPO environment variables must be set."
93107 )
94-
108+
109+
95110 local_repo_path = clone_repository (owner , repo , branch , base_path )
96111
97112 nodes = []
@@ -127,7 +142,8 @@ def parse_documents():
127142 file_summary .append (
128143 f"Found { len (matching_files )} { extension_list } files in the repository."
129144 )
130-
145+
146+
131147 loader = SimpleDirectoryReader (
132148 input_dir = local_repo_path , required_exts = extensions , recursive = True
133149 )
@@ -166,7 +182,7 @@ def get_es_vector_store():
166182 return es_vector_store
167183 except elastic_transport .ConnectionTimeout :
168184 print (f"Connection attempt { attempt + 1 } /{ retries } timed out. Retrying..." )
169- time .sleep (10 )
185+ time .sleep (10 )
170186 raise Exception ("Failed to initialize Elasticsearch store after multiple attempts" )
171187
172188
0 commit comments