Skip to content

Commit 0210014

Browse files
authored
add missing preprocess file (#386)
Signed-off-by: Siddhant Ray <siddhant.r98@gmail.com>
1 parent a9cfe50 commit 0210014

File tree

1 file changed

+68
-0
lines changed

1 file changed

+68
-0
lines changed
Lines changed: 68 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,68 @@
1+
import argparse
2+
import json
3+
import os
4+
5+
import numpy as np
6+
from transformers import AutoTokenizer
7+
8+
# CLI: --parse selects what fraction of the dataset to process.
parser = argparse.ArgumentParser(description="Process data percentage.")
parser.add_argument(
    "--parse",
    type=float,
    default=1,
    # argparse %-formats help text when rendering -h, so a literal percent
    # sign must be escaped as "%%"; a bare "%" raises
    # "ValueError: unsupported format character" on --help.
    help="The percentage of data to process (0 to 1). Default is 1 (100%%).",
)

args = parser.parse_args()
17+
18+
# Load the raw ShareGPT dump: a list of conversation records.
with open("ShareGPT_V3_unfiltered_cleaned_split.json", encoding="utf-8") as file:
    data = json.load(file)
20+
21+
22+
def estimate_num_tokens(text: str) -> int:
    """Count the tokens in *text* with the Mistral-7B-Instruct-v0.2 tokenizer.

    The tokenizer is loaded lazily on the first call and cached as a
    function attribute so every later call reuses the same instance.
    """
    tokenizer = getattr(estimate_num_tokens, "tokenizer", None)
    if tokenizer is None:
        # Silence HF tokenizers' fork-parallelism warning before loading.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        tokenizer = AutoTokenizer.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.2"
        )
        estimate_num_tokens.tokenizer = tokenizer
    return len(tokenizer.tokenize(text))
29+
30+
31+
def _token_stats(tokens):
    """Return (mean, max) of *tokens* as floats, or (0, 0) when empty."""
    if not tokens:
        return 0, 0
    return float(np.mean(tokens)), float(np.max(tokens))


num_of_ids = len(data)
print(f"Number of IDs: {num_of_ids}")
# Keep only the leading fraction of the dataset selected by --parse.
data = data[: int(num_of_ids * args.parse)]

count = 0

for d in data:
    # Each conversation turn (human OR gpt) counts as one round.
    d["num_round"] = len(d["conversations"])
    human_tokens = []
    gpt_tokens = []
    for conv in d["conversations"]:
        if conv["from"] == "human":
            human_tokens.append(estimate_num_tokens(conv["value"]))
        if conv["from"] == "gpt":
            # Only gpt turns get a per-message token count attached.
            token_number = estimate_num_tokens(conv["value"])
            conv["num_tokens"] = token_number
            gpt_tokens.append(token_number)
    # Summary statistics per record; (0, 0) when a side has no turns.
    d["average_human_token"], d["max_human_token"] = _token_stats(human_tokens)
    d["average_gpt_token"], d["max_gpt_token"] = _token_stats(gpt_tokens)

    count += 1
    print(f"Finished {count}")
63+
64+
# Remove the record known to contain two consecutive human rounds.
# Guard the hard-coded index: when --parse < 1 the truncated list can be
# shorter than 261 entries, and an unconditional delete raises IndexError.
if len(data) > 260:
    del data[260]

with open("ShareGPT.json", "w", encoding="utf-8") as file:
    json.dump(data, file, ensure_ascii=False, indent=2)

0 commit comments

Comments (0)