Skip to content

Commit 74572f1

Browse files
committed
Adding github assistant code
1 parent 53801c3 commit 74572f1

File tree

4 files changed

+398
-0
lines changed

4 files changed

+398
-0
lines changed
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
import logging
2+
import sys
3+
import os
4+
import pandas as pd
5+
from dotenv import load_dotenv
6+
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
7+
from llama_index.core.evaluation import (
8+
DatasetGenerator,
9+
RelevancyEvaluator,
10+
FaithfulnessEvaluator,
11+
EvaluationResult,
12+
)
13+
from llama_index.llms.openai import OpenAI
14+
from tabulate import tabulate
15+
import textwrap
16+
import argparse
17+
import traceback
18+
from httpx import ReadTimeout
19+
20+
# Mirror all logging to stdout so progress is visible when run as a script.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

parser = argparse.ArgumentParser(description='Process documents and questions for evaluation.')
parser.add_argument('--num_documents', type=int, default=None,
                    help='Number of documents to process (default: all)')
parser.add_argument('--skip_documents', type=int, default=0,
                    help='Number of documents to skip at the beginning (default: 0)')
parser.add_argument('--num_questions', type=int, default=None,
                    help='Number of questions to process (default: all)')
parser.add_argument('--skip_questions', type=int, default=0,
                    help='Number of questions to skip at the beginning (default: 0)')
parser.add_argument('--process_last_questions', action='store_true',
                    help='Process last N questions instead of first N')
args = parser.parse_args()

load_dotenv('.env')

reader = SimpleDirectoryReader("/tmp/elastic/production-readiness-review")
documents = reader.load_data()

# Preview up to three documents. The original indexed documents[0..2]
# unconditionally (IndexError on small corpora) and misspelled "Third".
for _preview_idx, _doc in enumerate(documents[:3], start=1):
    _ordinal = {1: "First", 2: "Second", 3: "Third"}[_preview_idx]
    print(f"{_ordinal} document: {_doc.text}")

if args.skip_documents > 0:
    documents = documents[args.skip_documents:]

if args.num_documents is not None:
    documents = documents[:args.num_documents]

print(f"Number of documents loaded: {len(documents)}")

llm = OpenAI(model="gpt-4o", request_timeout=120)

data_generator = DatasetGenerator.from_documents(documents, llm=llm)

try:
    eval_questions = data_generator.generate_questions_from_nodes()
    # Normalize to a list: some generator versions return one newline-joined string.
    if isinstance(eval_questions, str):
        eval_questions_list = eval_questions.strip().split('\n')
    else:
        eval_questions_list = eval_questions
    eval_questions_list = [q for q in eval_questions_list if q.strip()]

    # Snapshot the full question set before slicing. The original printed
    # "\All..." (broken escape rendered literally) and enumerated
    # eval_questions, which iterates *characters* when it is a string.
    all_questions = list(eval_questions_list)

    if args.skip_questions > 0:
        eval_questions_list = eval_questions_list[args.skip_questions:]

    if args.num_questions is not None:
        if args.process_last_questions:
            eval_questions_list = eval_questions_list[-args.num_questions:]
        else:
            eval_questions_list = eval_questions_list[:args.num_questions]

    print("\nAll available questions generated:")
    for idx, q in enumerate(all_questions):
        print(f"{idx}. {q}")

    print("\nGenerated questions:")
    for idx, q in enumerate(eval_questions_list, start=1):
        print(f"{idx}. {q}")
except ReadTimeout:
    # Fixed: the message blamed "Ollama", but this script talks to OpenAI.
    print("Request to OpenAI timed out during question generation. Please check the server or increase the timeout duration.")
    traceback.print_exc()
    sys.exit(1)
except Exception as e:
    print(f"An error occurred while generating questions: {e}")
    traceback.print_exc()
    sys.exit(1)

print(f"\nTotal number of questions generated: {len(eval_questions_list)}")

evaluator_relevancy = RelevancyEvaluator(llm=llm)
evaluator_faith = FaithfulnessEvaluator(llm=llm)

vector_index = VectorStoreIndex.from_documents(documents)
96+
97+
def display_eval_df(
    query: str,
    response: Response,
    eval_result_relevancy: EvaluationResult,
    eval_result_faith: EvaluationResult,
) -> None:
    """Print one evaluation row (query, response, source, verdicts) as a grid table."""

    def _wrap(text, width=50):
        # Normalize to str, drop carriage returns, wrap each line to `width`.
        if text is None:
            return ''
        cleaned = str(text).replace('\r', '')
        pieces = []
        for segment in cleaned.split('\n'):
            pieces.extend(textwrap.wrap(segment, width=width))
            pieces.append('')
        return '\n'.join(pieces)

    relevancy_feedback = getattr(eval_result_relevancy, 'feedback', '')
    is_relevant = getattr(eval_result_relevancy, 'passing', False)
    faith_feedback = getattr(eval_result_faith, 'feedback', '')
    is_faithful = getattr(eval_result_faith, 'passing', False)

    nodes = response.source_nodes
    source_content = _wrap(nodes[0].node.get_content()) if nodes else ''

    row = {
        "Query": _wrap(query),
        "Response": _wrap(str(response)),
        "Source": source_content,
        "Relevancy Response": 'Pass' if is_relevant else 'Fail',
        "Relevancy Feedback": _wrap(relevancy_feedback),
        # Score mirrors the pass/fail verdict as 1.0 / 0.0.
        "Relevancy Score": _wrap(str(1.0 if is_relevant else 0.0)),
        "Faith Response": 'Pass' if is_faithful else 'Fail',
        "Faith Feedback": _wrap(faith_feedback),
    }

    print("\nEvaluation Result:")
    print(tabulate(pd.DataFrame([row]), headers='keys', tablefmt='grid', showindex=False, stralign='left'))
145+
146+
query_engine = vector_index.as_query_engine(llm=llm)

total_questions = len(eval_questions_list)
# Evaluate each generated question; per-question failures are logged and skipped.
for idx, question in enumerate(eval_questions_list, start=1):
    try:
        response_vector = query_engine.query(question)
        eval_result_relevancy = evaluator_relevancy.evaluate_response(
            query=question, response=response_vector)
        eval_result_faith = evaluator_faith.evaluate_response(
            response=response_vector)

        print(f"\nProcessing Question {idx} of {total_questions}:")
        display_eval_df(question, response_vector, eval_result_relevancy, eval_result_faith)
    except ReadTimeout:
        print(f"Request to OpenAI timed out while processing question {idx}.")
        traceback.print_exc()
    except Exception as e:
        print(f"An error occurred while processing question {idx}: {e}")
        traceback.print_exc()
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
from llama_index.core import Document, Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex
2+
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, MarkdownNodeParser, JSONNodeParser
3+
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
4+
from dotenv import load_dotenv
5+
from llama_index.embeddings.openai import OpenAIEmbedding
6+
from llama_index.core.ingestion import IngestionPipeline
7+
import tree_sitter_python as tspython
8+
from tree_sitter_languages import get_parser, get_language
9+
from tree_sitter import Parser, Language
10+
import logging
11+
import nest_asyncio
12+
import elastic_transport
13+
import sys
14+
import subprocess
15+
import shutil
16+
import time
17+
import glob
18+
import os
19+
20+
# Verbose logging left available for debugging; enable when diagnosing
# Elasticsearch requests.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
#logging.getLogger('elasticsearch').setLevel(logging.DEBUG)

# Allow nested event loops (needed when llama-index runs async code inside
# an already-running loop, e.g. notebooks).
nest_asyncio.apply()

# Load API keys and ELASTIC_*/GITHUB_* settings from the local .env file.
load_dotenv('.env')

# Global llama-index settings: embedding model plus chunking limits used by
# the node parsers below.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.chunk_lines = 1024
Settings.chunk_size = 1024
Settings.chunk_lines_overlap = 20
Settings.max_chars = 1500
33+
34+
35+
def clone_repository(owner, repo, branch, base_path="/tmp"):
36+
branch = branch or os.getenv("GITHUB_BRANCH")
37+
if not branch:
38+
raise ValueError("Branch is not provided and GITHUB_BRANCH environment variable is not set.")
39+
40+
local_repo_path = os.path.join(base_path, owner, repo)
41+
clone_url = f"https://github.com/{owner}/{repo}.git"
42+
43+
if os.path.exists(local_repo_path):
44+
print(f"Repository already exists at {local_repo_path}. Skipping clone.")
45+
return local_repo_path
46+
47+
attempts = 3
48+
49+
for attempt in range(attempts):
50+
try:
51+
os.makedirs(local_repo_path, exist_ok=True)
52+
print(f"Attempting to clone repository... Attempt {attempt + 1}")
53+
subprocess.run(["git", "clone", "-b", branch, clone_url, local_repo_path], check=True)
54+
print(f"Repository cloned into {local_repo_path}.")
55+
return local_repo_path
56+
except subprocess.CalledProcessError:
57+
print(f"Attempt {attempt + 1} failed, retrying...")
58+
time.sleep(10)
59+
if attempt < attempts - 1:
60+
continue
61+
else:
62+
raise Exception("Failed to clone repository after multiple attempts")
63+
64+
def print_docs_and_nodes(docs, nodes):
    """Dump every document and every parsed node (id plus full text) to stdout."""
    print("\n=== Documents ===\n")
    for document in docs:
        print(f"Document ID: {document.doc_id}")
        print(f"Document Content:\n{document.text}\n\n---\n")

    print("\n=== Nodes ===\n")
    for parsed in nodes:
        print(f"Node ID: {parsed.id_}")
        print(f"Node Content:\n{parsed.text}\n\n---\n")
74+
75+
def collect_and_print_file_summary(file_summary):
    """Print each collected per-extension summary line under a header."""
    print("\n=== File Summary ===\n")
    for line in file_summary:
        print(line)
79+
80+
def parse_documents():
    """Clone the configured GitHub repository and split its files into nodes.

    Reads GITHUB_OWNER / GITHUB_REPO / GITHUB_BRANCH / BASE_PATH from the
    environment, routes each file extension to a suitable splitter, prints a
    per-extension summary, and returns the combined list of parsed nodes.

    Raises ValueError when GITHUB_OWNER or GITHUB_REPO is unset.
    """
    owner = os.getenv('GITHUB_OWNER')
    repo = os.getenv('GITHUB_REPO')
    branch = os.getenv('GITHUB_BRANCH')
    base_path = os.getenv('BASE_PATH', "/tmp")

    if not owner or not repo:
        raise ValueError("GITHUB_OWNER and GITHUB_REPO environment variables must be set.")

    local_repo_path = clone_repository(owner, repo, branch, base_path)

    nodes = []
    file_summary = []

    # Tree-sitter grammars for the language-aware code splitters below.
    ts_parser = get_parser('typescript')
    py_parser = get_parser('python')
    go_parser = get_parser('go')
    js_parser = get_parser('javascript')
    bash_parser = get_parser('bash')
    yaml_parser = get_parser('yaml')

    parsers_and_extensions = [
        (SentenceSplitter(), [".md"]),
        (CodeSplitter(language='python', parser=py_parser), [".py", ".ipynb"]),
        (CodeSplitter(language='typescript', parser=ts_parser), [".ts"]),
        (CodeSplitter(language='go', parser=go_parser), [".go"]),
        (CodeSplitter(language='javascript', parser=js_parser), [".js"]),
        # Fixed: ",sh" (typo for ".sh") silently skipped all shell scripts.
        (CodeSplitter(language='bash', parser=bash_parser), [".bash", ".sh"]),
        (CodeSplitter(language='yaml', parser=yaml_parser), [".yaml", ".yml"]),
        (JSONNodeParser(), [".json"]),
    ]

    for parser, extensions in parsers_and_extensions:
        matching_files = []
        for ext in extensions:
            matching_files.extend(glob.glob(f"{local_repo_path}/**/*{ext}", recursive=True))

        if matching_files:
            file_summary.append(f"Found {len(matching_files)} {', '.join(extensions)} files in the repository.")
            loader = SimpleDirectoryReader(input_dir=local_repo_path, required_exts=extensions, recursive=True)
            docs = loader.load_data()
            parsed_nodes = parser.get_nodes_from_documents(docs)

            print_docs_and_nodes(docs, parsed_nodes)

            nodes.extend(parsed_nodes)
        else:
            file_summary.append(f"No {', '.join(extensions)} files found in the repository.")

    collect_and_print_file_summary(file_summary)
    print("\n")
    return nodes
132+
133+
def get_es_vector_store():
    """Build an ElasticsearchStore from ELASTIC_* env vars, retrying on timeouts."""
    print("Initializing Elasticsearch store...")
    es_cloud_id = os.getenv("ELASTIC_CLOUD_ID")
    es_user = os.getenv("ELASTIC_USER")
    es_password = os.getenv("ELASTIC_PASSWORD")
    index_name = os.getenv("ELASTIC_INDEX")
    retries = 20
    for attempt in range(retries):
        try:
            store = ElasticsearchStore(
                index_name=index_name,
                es_cloud_id=es_cloud_id,
                es_user=es_user,
                es_password=es_password,
                batch_size=100
            )
        except elastic_transport.ConnectionTimeout:
            # Transient network issue — wait and retry.
            print(f"Connection attempt {attempt + 1}/{retries} timed out. Retrying...")
            time.sleep(10)
        else:
            print("Elasticsearch store initialized.")
            return store
    raise Exception("Failed to initialize Elasticsearch store after multiple attempts")
155+
156+
def main():
    """Parse repository documents and ingest the resulting nodes into Elasticsearch."""
    nodes = parse_documents()
    es_vector_store = get_es_vector_store()

    try:
        pipeline = IngestionPipeline(vector_store=es_vector_store)
        pipeline.run(documents=nodes, show_progress=True)
    finally:
        # Release the ES connection even when ingestion fails.
        if hasattr(es_vector_store, "close"):
            es_vector_store.close()
            print("Elasticsearch connection closed.")


if __name__ == "__main__":
    main()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import asyncio
2+
from llama_index.core import VectorStoreIndex, QueryBundle, Settings
3+
from llama_index.llms.openai import OpenAI
4+
from llama_index.embeddings.openai import OpenAIEmbedding
5+
from index import get_es_vector_store
6+
import httpx
7+
8+
# Use the same embedding model that was used at ingest time so query vectors
# live in the same space as the stored document vectors.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model
10+
11+
def run_query_sync():
    """Prompt for a query and answer it against the Elasticsearch-backed index.

    Returns the response text, or None when the query fails. The underlying
    httpx clients and the Elasticsearch connection are closed in all cases.
    """
    query = input("Please enter your query: ")

    openai_llm = OpenAI(model="gpt-4o")

    es_vector_store = get_es_vector_store()
    index = VectorStoreIndex.from_vector_store(es_vector_store)

    try:
        query_engine = index.as_query_engine(
            llm=openai_llm,
            similarity_top_k=3,
            streaming=False,
            response_mode="tree_summarize"
        )

        # Embed the query up front so the engine reuses our embedding model.
        bundle = QueryBundle(query, embedding=embed_model.get_query_embedding(query))

        result = query_engine.query(bundle)
        return result.response
    except Exception as e:
        print(f"An error occurred while running the query: {e}")
    finally:
        # Close any underlying httpx clients before dropping the ES connection.
        for owner in (openai_llm, embed_model):
            client = getattr(owner, 'client', None)
            if isinstance(client, httpx.Client):
                client.close()
        if hasattr(es_vector_store, "close"):
            es_vector_store.close()
            print("Elasticsearch connection closed.")
41+
42+
if __name__ == "__main__":
    # Top-level boundary: report any unhandled failure instead of a traceback.
    try:
        print(run_query_sync())
    except Exception as e:
        print(f"An error occurred: {e}")

0 commit comments

Comments
 (0)