-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_loader.py
More file actions
107 lines (95 loc) · 4.06 KB
/
data_loader.py
File metadata and controls
107 lines (95 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import pandas as pd
from datasets import Dataset
import json
from .prompt import *
from .data_processing import *
# def get_data(train_path1, dep_path, train_path2=None, type="grpo"):
# df = read_amr(train_path1)
# if train_path2:
# df2 = read_amr(train_path2)
# df = pd.concat([df, df2], ignore_index=True)
# with open(dep_path, "r", encoding="utf-8") as f_json:
# deps_data = json.load(f_json)
# if len(df) != len(deps_data):
# print("Warning: Dataframe and dependencies list have different lengths.")
# def process_df(df, deps_list):
# processed = []
# for idx, row in df.iterrows():
# dep_entry = deps_list[idx]
# sentence = dep_entry["sentence"]
# dep_info = dep_entry["dependency"]
# dep_str = str(dep_info)
# if type == "grpo":
# user_prompt = (
# f"Chuyển câu sau thành biểu diễn AMR dạng PENMAN.\n"
# f"Câu: {sentence}\n"
# f"Dependency: {dep_str}"
# )
# prompt = [
# {"role": "system", "content": SYSTEM_PROMPT},
# {"role": "user", "content": user_prompt}
# ]
# processed.append({
# "prompt": prompt,
# "answers": row['actions']
# })
# else:
# user_prompt = (
# f"{SYSTEM_PROMPT}\n\n"
# f"Chuyển câu sau thành biểu diễn AMR dạng PENMAN.\n"
# f"Câu: {sentence}\n"
# f"Dependency: {dep_str}"
# )
# processed.append({
# "prompt": [{"role": "user", "content": user_prompt}],
# "completion": [
# {"role": "assistant", "content": f"<answer>{row['actions']}</answer>"}
# ]
# })
# return Dataset.from_list(processed)
# train_dataset = process_df(df, deps_data)
# return train_dataset
def get_data(train_path1, train_path2=None, type="grpo"):
    """Build a chat-formatted training ``Dataset`` from one or two AMR files.

    Each path is loaded with ``read_amr_direct``; when ``train_path2`` is
    truthy its rows are appended after those of ``train_path1`` with a fresh
    index. Every row is read through its ``"query"`` field (the sentence) and
    its ``"amr"`` field (the target string).

    Args:
        train_path1: Path handed to ``read_amr_direct`` for the first file.
        train_path2: Optional second path; ignored when falsy.
        type: ``"grpo"`` produces ``{"prompt", "answers"}`` records holding
            the raw AMR string. Any other value produces
            ``{"prompt", "completion"}`` records whose single assistant
            message wraps the AMR string in ``<answer>...</answer>``.

    Returns:
        Whatever ``Dataset.from_list`` returns for the list of records.

    Side effects:
        Prints the largest space-separated token counts seen for the user
        prompt and for the AMR target.
    """
    # presumably read_amr_direct returns a pandas DataFrame (it is passed to
    # pd.concat and iterated with .iterrows()) -- confirm in data_processing.
    frame = read_amr_direct(train_path1)
    if train_path2:
        frame = pd.concat(
            [frame, read_amr_direct(train_path2)],
            ignore_index=True,
        )

    # NOTE(review): the instruction and the "Câu: ..." part are joined with no
    # separator, so the rendered prompt reads "...trên.Câu: ...". Confirm this
    # is intended before changing it: it alters the text of every prompt.
    instruction = (
        "Chuyển câu sau thành biểu diễn AMR dạng chuỗi PENMAN một dòng theo đúng quy tắc trên."
    )
    want_grpo = type == "grpo"

    records = []
    longest_prompt = 0
    longest_target = 0
    for _, row in frame.iterrows():
        sentence = row["query"]
        user_prompt = instruction + f"Câu: {sentence}\n"

        # Lengths are rough diagnostics: pieces obtained by splitting on a
        # single space, not tokenizer tokens.
        prompt_words = len(user_prompt.split(" "))
        if prompt_words > longest_prompt:
            longest_prompt = prompt_words

        target = row["amr"]
        target_words = len(target.split(" "))
        if target_words > longest_target:
            longest_target = target_words

        record = {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
        }
        if want_grpo:
            # GRPO records carry the reference AMR under "answers".
            record["answers"] = target
        else:
            # Every other mode carries a tagged assistant completion.
            record["completion"] = [
                {"role": "assistant", "content": f"<answer>{target}</answer>"},
            ]
        records.append(record)

    print(f"Max input length: {longest_prompt}, Max output length: {longest_target}")
    return Dataset.from_list(records)