-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
59 lines (44 loc) · 1.97 KB
/
utils.py
File metadata and controls
59 lines (44 loc) · 1.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from typing import TypedDict
from langchain_core.documents import Document
from config import GraphConfig
def format_document(entry: dict) -> Document:
    """
    Build a ``Document`` whose text merges an entry's prose and its table.

    The table is rendered as a minimal HTML ``<table>`` (one ``<tr>`` per row,
    one ``<td>`` per cell; cell values are interpolated as-is, unescaped).
    The resulting page content is laid out, separated by blank lines, as:

        pre_text lines / table HTML / post_text lines / "Table Data:" + table HTML

    Note that the table HTML therefore appears twice in the page content.

    Args:
        entry (dict): Must provide "pre_text" and "post_text" (iterables of
            strings), "table" (iterable of rows, each an iterable of cells)
            and "id". An optional "qa" value is stringified into the metadata.

    Returns:
        Document: ``id`` and ``metadata["id"]`` are ``entry["id"]``;
        ``metadata["qa"]`` is ``str(entry.get("qa"))``.

    References (why HTML was chosen for tables):
    - Study that shows GPT4 performs best with HTML: https://arxiv.org/html/2305.13062v4
    - Discussion where a few people mentioned they had the best results with HTML: https://news.ycombinator.com/item?id=41043771
    """
    # Prose that precedes the table, one source line per output line.
    leading_text = "\n".join(entry["pre_text"])

    # Render every table row as a <tr> holding one <td> per cell.
    rendered_rows = []
    for row in entry["table"]:
        cell_markup = "".join(f" <td>{cell}</td>\n" for cell in row)
        rendered_rows.append(" <tr>\n" + cell_markup + " </tr>\n")
    table_markup = "<table>\n" + "".join(rendered_rows) + "</table>"

    # Prose that follows the table.
    trailing_text = "\n".join(entry["post_text"])

    # Sections are separated by a blank line; the table is repeated at the
    # end under an explicit "Table Data:" label.
    sections = [
        leading_text,
        table_markup,
        trailing_text,
        "Table Data:\n" + table_markup,
    ]
    page_text = "\n\n".join(sections)

    entry_id = entry["id"]
    return Document(
        id=entry_id,
        page_content=page_text,
        metadata={"id": entry_id, "qa": str(entry.get("qa"))},
    )
# Apply Llama3.1 chat-template
def format_prompt(user_query: str):
    """
    Wrap a user query in the Llama3.1 chat-template.

    The prompt consists of a fixed system turn ("You are a helpful
    assistant."), a user turn containing ``user_query``, and an opened
    assistant header so the model continues with its reply.

    Args:
        user_query (str): The user query.

    Returns:
        The fully formatted prompt string.

    References:
    - Llama3.1 chat-template: https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1
    """
    system_turn = (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
    )
    user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{user_query}<|eot_id|>"
    # The assistant header is left open: generation starts right after it.
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    return "<|begin_of_text|>" + system_turn + user_turn + assistant_header
def typed_dict_to_dict(x) -> dict:
    """
    Copy the entries of ``x.__dict__`` into a plain dict.

    Entries whose name starts with two underscores (e.g. ``__module__``,
    ``__annotations__``) are left out; everything else is kept, including
    names with a single leading underscore.

    Args:
        x: Any object exposing a ``__dict__`` attribute.

    Returns:
        dict: A new dict of the remaining name/value pairs.
    """
    kept = {}
    for name, value in x.__dict__.items():
        if name.startswith('__'):
            continue
        kept[name] = value
    return kept