Skip to content
This repository was archived by the owner on Oct 22, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions CLIENT/add_message.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""CLI client: prompt for a speaker and message, then POST them to REMO."""
import requests
import time

ADD_MESSAGE_URL = 'http://localhost:8000/add_message'

# Collect the speaker name and message text interactively.
speaker = input('User: ')
content = input('Enter your message: ')

# The server expects the message, speaker, and a Unix timestamp as query params.
payload = {
    'message': content,
    'speaker': speaker,
    'timestamp': time.time(),
}
response = requests.post(ADD_MESSAGE_URL, params=payload)

# Echo the HTTP status and the server's JSON reply for quick inspection.
print(response.status_code)
print(response.json())
9 changes: 9 additions & 0 deletions CLIENT/maintain_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""CLI client: trigger the REMO server's incremental tree-maintenance endpoint."""
# NOTE: removed unused `import time` — this script sends no timestamp.
import requests

url = 'http://localhost:8000/maintain_tree'

# No payload: the server maintains the tree from its own stored data.
response = requests.post(url)

# Echo the HTTP status and the server's JSON reply for quick inspection.
print(response.status_code)
print(response.json())
9 changes: 9 additions & 0 deletions CLIENT/rebuild_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""CLI client: trigger a full rebuild of the REMO summary tree."""
# NOTE: removed unused `import time` — this script sends no timestamp.
import requests

url = 'http://localhost:8000/rebuild_tree'

# No payload: the server rebuilds the tree from its own stored data.
response = requests.post(url)

# Echo the HTTP status and the server's JSON reply for quick inspection.
print(response.status_code)
print(response.json())
11 changes: 11 additions & 0 deletions CLIENT/search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""CLI client: prompt for a query and run a semantic search against REMO."""
# NOTE: removed unused `import time` — this script sends no timestamp.
import requests

url = 'http://localhost:8000/search'

query = input('search: ')

# Search is a GET with the query text passed as a query parameter.
response = requests.get(url, params={'query': query})

# Echo the HTTP status and the server's JSON reply for quick inspection.
print(response.status_code)
print(response.json())
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,18 @@ To run REMO, you will need the following:
- openai
- PyYAML

## Installation

Note: You may need to change tensorflow to tensorflow-macos in your requirements.txt file on certain OS X machines.

1. Run `pip install -r requirements.txt`
2. Create key_openai.txt file and put your OpenAI API key inside.

## Usage

1. Start the FastAPI server: `uvicorn remo:app --reload`
2. Interact with the API using a REST client or web browser: `http://localhost:8000`


## API Endpoints

- **POST /add_message**: Add a new message to REMO. Speaker, timestamp, and content required.
Expand Down
147 changes: 104 additions & 43 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,44 @@
import tensorflow_hub as hub


embedding_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
embedding_model = hub.load(
"https://tfhub.dev/google/universal-sentence-encoder-large/5")


def open_file(filepath):
    """Read and return the entire contents of a UTF-8 text file.

    Undecodable bytes are silently dropped (errors='ignore'), so the
    result may be shorter than the raw file for malformed input.
    """
    with open(filepath, mode='r', encoding='utf-8', errors='ignore') as handle:
        return handle.read()


def save_file(filepath, content):
    """Write `content` to `filepath` as UTF-8, replacing any existing file."""
    with open(filepath, mode='w', encoding='utf-8') as handle:
        handle.write(content)


def save_yaml(filepath, data):
    """Serialize `data` to `filepath` as YAML, keeping unicode readable.

    allow_unicode=True writes non-ASCII characters verbatim rather than
    as escape sequences.
    """
    with open(filepath, 'w', encoding='utf-8') as outfile:
        yaml.dump(data, outfile, allow_unicode=True)


def load_yaml(filepath):
    """Parse and return the YAML document stored at `filepath`.

    Uses FullLoader, which resolves standard YAML tags but not arbitrary
    Python objects; only feed it trusted files regardless.
    """
    with open(filepath, 'r', encoding='utf-8') as infile:
        return yaml.load(infile, Loader=yaml.FullLoader)


def create_message(message: str, speaker: str, timestamp: float) -> Dict[str, Any]:
    """Bundle a chat message, its speaker, and its timestamp into a dict."""
    return {
        "content": message,
        "speaker": speaker,
        "timestamp": timestamp,
    }


def save_message(root_folder, message: Dict[str, Any]):
    """Persist a message dict as a YAML file under <root>/L1_raw_logs.

    The filename encodes the timestamp and speaker so raw logs sort
    chronologically and are unique per speaker at a given instant.
    """
    log_dir = os.path.join(root_folder, "L1_raw_logs")
    os.makedirs(log_dir, exist_ok=True)
    filename = f"chat_{message['timestamp']}_{message['speaker']}.yaml"
    save_yaml(os.path.join(log_dir, filename), message)


def search_tree(root_folder, query):
# TODO add a "forks" parameter to allow for branching relevance
# TODO add a "fuzziness" parameter that can generate a random vector to modify the search query
Expand All @@ -55,13 +62,15 @@ def search_tree(root_folder, query):
level -= 1

while level > 2:
level_files = [os.path.join(level_dir, f) for f in os.listdir(level_dir) if f.endswith(".yaml")]
level_files = [os.path.join(level_dir, f) for f in os.listdir(
level_dir) if f.endswith(".yaml")]
max_similarity = -1
closest_file = None

for file in level_files:
data = load_yaml(file)
similarity = cosine_similarity(query_embedding, np.array(data["vector"]).reshape(1, -1))[0][0]
similarity = cosine_similarity(
query_embedding, np.array(data["vector"]).reshape(1, -1))[0][0]

if similarity > max_similarity:
max_similarity = similarity
Expand All @@ -82,9 +91,9 @@ def search_tree(root_folder, query):


def rebuild_tree(root_folder: str, max_cluster_size: int = 10):
# Delete all folders except L1_raw_logs, L2_message_pairs and .git
# Delete all folders except L1_raw_logs, L2_message_pairs, .git, CLIENT and REMO
for folder_name in os.listdir(root_folder):
if folder_name not in {"L1_raw_logs", "L2_message_pairs", ".git"}:
if folder_name not in {"L1_raw_logs", "L2_message_pairs", ".git", "CLIENT", "REMO"}:
folder_path = os.path.join(root_folder, folder_name)
if os.path.isdir(folder_path):
shutil.rmtree(folder_path)
Expand All @@ -98,22 +107,26 @@ def rebuild_tree(root_folder: str, max_cluster_size: int = 10):
process_missing_messages(root_folder)

# Cluster L2 message pairs using cosine similarity, up to 10 per cluster
clusters = cluster_elements(root_folder, "L2_message_pairs", max_cluster_size)
clusters = cluster_elements(
root_folder, "L2_message_pairs", max_cluster_size)

# Create summaries and save them in the next rank (L3_summaries)
create_summaries(root_folder, clusters, f"L3_summaries", "L2_message_pairs")
create_summaries(root_folder, clusters,
f"L3_summaries", "L2_message_pairs")

# If top rank (e.g. L3_summaries) has > max_cluster_size files, repeat process, creating new taxonomical ranks
current_rank = 3
while True:
# calculate clusters at new rank
clusters = cluster_elements(root_folder, f"L{current_rank}_summaries", max_cluster_size)

clusters = cluster_elements(
root_folder, f"L{current_rank}_summaries", max_cluster_size)

# summarize those clusters
create_summaries(root_folder, clusters, f"L{current_rank + 1}_summaries", f"L{current_rank}_summaries")
create_summaries(root_folder, clusters,
f"L{current_rank + 1}_summaries", f"L{current_rank}_summaries")
current_rank += 1
# if clusters less than max cluster size, we are done :)

# if clusters less than max cluster size, we are done :)
if len(clusters) <= max_cluster_size:
break

Expand Down Expand Up @@ -157,9 +170,11 @@ def process_missing_messages(root_folder: str):
}

# Save message pair in L2_message_pairs folder
message_pair_path = os.path.join(message_pairs_dir, message_pair_filename)
message_pair_path = os.path.join(
message_pairs_dir, message_pair_filename)
save_yaml(message_pair_path, message_pair_data)


def create_summaries(root_folder: str, clusters: List[List[str]], target_folder: str, source_folder: str):
source_folder_path = os.path.join(root_folder, source_folder)
target_folder_path = os.path.join(root_folder, target_folder)
Expand Down Expand Up @@ -219,84 +234,94 @@ def cluster_elements(root_folder: str, target_folder: str, max_cluster_size: int

return clusters


def maintain_tree(root_folder: str):
    """Incrementally fold newly arrived message pairs into the summary tree.

    Unlike a full rebuild, this only looks at L2 message pairs created by
    this pass and merges each one into the closest L3 summary.
    """
    pairs_dir = os.path.join(root_folder, "L2_message_pairs")

    # Make sure the L2 directory exists before listing it.
    os.makedirs(pairs_dir, exist_ok=True)

    # Snapshot L2 before and after processing; the set difference is
    # exactly the message pairs generated by this pass.
    existing = set(os.listdir(pairs_dir))
    process_missing_messages(root_folder)
    created = set(os.listdir(pairs_dir)) - existing

    # Merge each new pair into L3, using 0.75 cosine similarity as the
    # threshold between updating an existing summary and starting a new one.
    integrate_new_elements(root_folder, "L3_summaries", created, 0.75)


def integrate_new_elements(root_folder: str, target_folder: str, new_elements: List[str], threshold: float):
    """Merge new L2 message pairs into the summaries stored in `target_folder`.

    For each new element, the most similar existing summary is found by
    cosine similarity over stored embedding vectors. Above `threshold` the
    element is folded into that summary (content re-summarized, embedding
    and timestamp refreshed); otherwise it seeds a brand-new summary file.
    """
    summaries_dir = os.path.join(root_folder, target_folder)
    if not os.path.exists(summaries_dir):
        os.makedirs(summaries_dir)

    for element_name in new_elements:
        element_data = load_yaml(
            os.path.join(root_folder, "L2_message_pairs", element_name))
        element_vector = np.array(element_data["vector"]).reshape(1, -1)

        # Linear scan over every summary for the best cosine match.
        best_score = -1
        best_name = None
        for candidate in os.listdir(summaries_dir):
            candidate_data = load_yaml(os.path.join(summaries_dir, candidate))
            candidate_vector = np.array(candidate_data["vector"]).reshape(1, -1)
            score = cosine_similarity(element_vector, candidate_vector)[0][0]
            if score > best_score:
                best_score = score
                best_name = candidate

        if best_score > threshold:
            # Close enough: fold the element into the existing summary,
            # then regenerate its text, embedding, and timestamp.
            summary_path = os.path.join(summaries_dir, best_name)
            summary_data = load_yaml(summary_path)
            summary_data["files"].append(element_name)

            merged_text = summary_data["content"] + \
                " --- " + element_data["content"]
            refreshed = quick_summarize(merged_text)

            summary_data["content"] = refreshed
            summary_data["vector"] = embedding_model([refreshed]).numpy().tolist()
            summary_data["timestamp"] = time()
            save_yaml(summary_path, summary_data)
        else:
            # Nothing similar enough exists: start a fresh summary from
            # this element alone.
            fresh_text = quick_summarize(element_data["content"])
            fresh_record = {
                "content": fresh_text,
                "vector": embedding_model([fresh_text]).numpy().tolist(),
                "files": [element_name],
                "timestamp": time()
            }

            # Filename is derived from the current summary count — note this
            # can collide if summaries are ever deleted.
            fresh_name = f"summary_{len(os.listdir(summaries_dir))}.yaml"
            save_yaml(os.path.join(summaries_dir, fresh_name), fresh_record)


Expand All @@ -305,27 +330,28 @@ def quick_summarize(text):

if len(text) <= max_chunk_size:
prompt = 'Write a detailed summary of the following:\n\n%s\n\nDETAILED SUMMARY:' % text
response = gpt3_completion(prompt)
response = gpt3_5_turbo_chat(prompt)
return response
else:
# Split the text into evenly sized chunks
num_chunks = int(np.ceil(len(text) / max_chunk_size))
chunk_size = int(np.ceil(len(text) / num_chunks))
text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
text_chunks = [text[i:i + chunk_size]
for i in range(0, len(text), chunk_size)]

# Summarize each chunk
summaries = []
for chunk in text_chunks:
prompt = 'Write a detailed summary of the following:\n\n%s\n\nDETAILED SUMMARY:' % chunk
response = gpt3_completion(prompt)
response = gpt3_5_turbo_chat(prompt)
summaries.append(response)

# Concatenate the summaries and return the result
final_summary = " ".join(summaries)
return final_summary


def gpt3_completion(prompt, engine='text-davinci-003', temp=0.0, top_p=1.0, tokens=1000, freq_pen=0.0, pres_pen=0.0, stop=['asdfasdfasdf']):
""" def gpt3_completion(prompt, engine='text-davinci-003', temp=0.0, top_p=1.0, tokens=1000, freq_pen=0.0, pres_pen=0.0, stop=['asdfasdfasdf']):
openai.api_key = open_file('key_openai.txt')
max_retry = 5
retry = 0
Expand Down Expand Up @@ -354,4 +380,39 @@ def gpt3_completion(prompt, engine='text-davinci-003', temp=0.0, top_p=1.0, toke
if retry >= max_retry:
return "GPT3 error: %s" % oops
print('Error communicating with OpenAI:', oops)
sleep(1) """


def gpt3_5_turbo_chat(prompt, model='gpt-3.5-turbo', temp=0.0, top_p=1.0, tokens=1000, freq_pen=0.0, pres_pen=0.0, stop=None):
    """Send a single-turn prompt to the OpenAI chat API and return the reply.

    The prompt is stripped to ASCII before sending. Each successful call is
    logged to gpt3_logs/ as "<timestamp>_gpt3_5_turbo.txt". On any API error
    the call is retried up to 5 times with a 1-second pause; after that an
    error string is returned instead of raising.

    Args:
        prompt: User message text (non-ASCII characters are dropped).
        model, temp, top_p, tokens, freq_pen, pres_pen: Passed through to
            openai.ChatCompletion.create.
        stop: Stop sequence list; defaults to ['asdfasdfasdf'], an
            effectively-never-matching sentinel.

    Returns:
        The model's reply text, or "GPT-3.5 Turbo error: ..." after
        exhausting retries.
    """
    # Fixed mutable-default-argument pitfall: build the default list per call.
    if stop is None:
        stop = ['asdfasdfasdf']
    # NOTE(review): key file is used verbatim — a trailing newline in
    # key_openai.txt may need stripping; confirm against open_file's output.
    openai.api_key = open_file('key_openai.txt')
    max_retry = 5
    retry = 0
    prompt = prompt.encode(encoding='ASCII', errors='ignore').decode()

    messages = [{'role': 'user', 'content': prompt}]

    while True:
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=temp,
                max_tokens=tokens,
                top_p=top_p,
                frequency_penalty=freq_pen,
                presence_penalty=pres_pen,
                stop=stop
            )
            text = response['choices'][0]['message']['content'].strip()
            # Log every prompt/response pair for later inspection.
            filename = '%s_gpt3_5_turbo.txt' % time()
            if not os.path.exists('gpt3_logs'):
                os.makedirs('gpt3_logs')
            save_file('gpt3_logs/%s' % filename, prompt +
                      '\n\n==========\n\n' + text)
            return text
        except Exception as oops:
            retry += 1
            if retry >= max_retry:
                # Give up: return the error as text so callers never raise.
                return "GPT-3.5 Turbo error: %s" % oops
            print('Error communicating with OpenAI:', oops)
            sleep(1)