1919
2020#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
2121#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
22- #logging.getLogger(' elasticsearch' ).setLevel(logging.DEBUG)
22+ #logging.getLogger(" elasticsearch" ).setLevel(logging.DEBUG)
2323
2424nest_asyncio .apply ()
2525
26- load_dotenv (' .env' )
26+ load_dotenv (" .env" )
2727
2828Settings .embed_model = OpenAIEmbedding (model = "text-embedding-3-large" )
2929Settings .chunk_lines = 1024
3535def clone_repository (owner , repo , branch , base_path = "/tmp" ):
3636 branch = branch or os .getenv ("GITHUB_BRANCH" )
3737 if not branch :
38- raise ValueError ("Branch is not provided and GITHUB_BRANCH environment variable is not set." )
38+ raise ValueError (
39+ "Branch is not provided and GITHUB_BRANCH environment variable is not set."
40+ )
3941
4042 local_repo_path = os .path .join (base_path , owner , repo )
4143 clone_url = f"https://github.com/{ owner } /{ repo } .git"
@@ -50,7 +52,9 @@ def clone_repository(owner, repo, branch, base_path="/tmp"):
5052 try :
5153 os .makedirs (local_repo_path , exist_ok = True )
5254 print (f"Attempting to clone repository... Attempt { attempt + 1 } " )
53- subprocess .run (["git" , "clone" , "-b" , branch , clone_url , local_repo_path ], check = True )
55+ subprocess .run (
56+ ["git" , "clone" , "-b" , branch , clone_url , local_repo_path ], check = True
57+ )
5458 print (f"Repository cloned into { local_repo_path } ." )
5559 return local_repo_path
5660 except subprocess .CalledProcessError :
@@ -78,53 +82,63 @@ def collect_and_print_file_summary(file_summary):
7882 print (summary )
7983
8084def parse_documents ():
81- owner = os .getenv (' GITHUB_OWNER' )
82- repo = os .getenv (' GITHUB_REPO' )
83- branch = os .getenv (' GITHUB_BRANCH' )
84- base_path = os .getenv (' BASE_PATH' , "/tmp" )
85+ owner = os .getenv (" GITHUB_OWNER" )
86+ repo = os .getenv (" GITHUB_REPO" )
87+ branch = os .getenv (" GITHUB_BRANCH" )
88+ base_path = os .getenv (" BASE_PATH" , "/tmp" )
8589
8690 if not owner or not repo :
87- raise ValueError ("GITHUB_OWNER and GITHUB_REPO environment variables must be set." )
91+ raise ValueError (
92+ "GITHUB_OWNER and GITHUB_REPO environment variables must be set."
93+ )
8894
8995 local_repo_path = clone_repository (owner , repo , branch , base_path )
9096
9197 nodes = []
9298 file_summary = []
9399
94- ts_parser = get_parser (' typescript' )
95- py_parser = get_parser (' python' )
96- go_parser = get_parser ('go' )
97- js_parser = get_parser (' javascript' )
98- bash_parser = get_parser (' bash' )
99- yaml_parser = get_parser (' yaml' )
100+ ts_parser = get_parser (" typescript" )
101+ py_parser = get_parser (" python" )
102+ go_parser = get_parser ("go" )
103+ js_parser = get_parser (" javascript" )
104+ bash_parser = get_parser (" bash" )
105+ yaml_parser = get_parser (" yaml" )
100106
101107 parsers_and_extensions = [
102108 (SentenceSplitter (), [".md" ]),
103- (CodeSplitter (language = ' python' , parser = py_parser ), [".py" , ".ipynb" ]),
104- (CodeSplitter (language = ' typescript' , parser = ts_parser ), [".ts" ]),
105- (CodeSplitter (language = 'go' , parser = go_parser ), [".go" ]),
106- (CodeSplitter (language = ' javascript' , parser = js_parser ), [".js" ]),
107- (CodeSplitter (language = ' bash' , parser = bash_parser ), [".bash" , ",sh" ]),
108- (CodeSplitter (language = ' yaml' , parser = yaml_parser ), [".yaml" , ".yml" ]),
109+ (CodeSplitter (language = " python" , parser = py_parser ), [".py" , ".ipynb" ]),
110+ (CodeSplitter (language = " typescript" , parser = ts_parser ), [".ts" ]),
111+ (CodeSplitter (language = "go" , parser = go_parser ), [".go" ]),
112+ (CodeSplitter (language = " javascript" , parser = js_parser ), [".js" ]),
113+ (CodeSplitter (language = " bash" , parser = bash_parser ), [".bash" , ",sh" ]),
114+ (CodeSplitter (language = " yaml" , parser = yaml_parser ), [".yaml" , ".yml" ]),
109115 (JSONNodeParser (), [".json" ]),
110116 ]
111117
112118 for parser , extensions in parsers_and_extensions :
113119 matching_files = []
114120 for ext in extensions :
115- matching_files .extend (glob .glob (f"{ local_repo_path } /**/*{ ext } " , recursive = True ))
121+ matching_files .extend (
122+ glob .glob (f"{ local_repo_path } /**/*{ ext } " , recursive = True )
123+ )
116124
117125 if len (matching_files ) > 0 :
118- file_summary .append (f"Found { len (matching_files )} { ', ' .join (extensions )} files in the repository." )
119- loader = SimpleDirectoryReader (input_dir = local_repo_path , required_exts = extensions , recursive = True )
126+ file_summary .append (
127+ f"Found { len (matching_files )} { ", " .join (extensions )} files in the repository."
128+ )
129+ loader = SimpleDirectoryReader (
130+ input_dir = local_repo_path , required_exts = extensions , recursive = True
131+ )
120132 docs = loader .load_data ()
121133 parsed_nodes = parser .get_nodes_from_documents (docs )
122134
123135 print_docs_and_nodes (docs , parsed_nodes )
124136
125137 nodes .extend (parsed_nodes )
126138 else :
127- file_summary .append (f"No { ', ' .join (extensions )} files found in the repository." )
139+ file_summary .append (
140+ f"No { ", " .join (extensions )} files found in the repository."
141+ )
128142
129143 collect_and_print_file_summary (file_summary )
130144 print ("\n " )
@@ -144,7 +158,7 @@ def get_es_vector_store():
144158 es_cloud_id = es_cloud_id ,
145159 es_user = es_user ,
146160 es_password = es_password ,
147- batch_size = 100
161+ batch_size = 100 ,
148162 )
149163 print ("Elasticsearch store initialized." )
150164 return es_vector_store
0 commit comments