
Commit 63f4f33

revert untouched files
1 parent 282cdb8 commit 63f4f33

13 files changed, +23 -18 lines changed

eval/chat_benchmarks/MTBench/fastchat/data/clean_sharegpt.py

Lines changed: 6 additions & 3 deletions
@@ -5,7 +5,6 @@
 Usage:
 python3 -m fastchat.data.clean_sharegpt --in sharegpt_html.json --out sharegpt_clean.json
 """
-
 import argparse
 from concurrent.futures import ProcessPoolExecutor
 import json
@@ -20,7 +19,9 @@

 div_pattern = re.compile("<div.*?>")
 span_pattern = re.compile("<span.*?>")
-code_lang_pattern = re.compile("```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + "\s*?```", re.DOTALL)
+code_lang_pattern = re.compile(
+    "```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + "\s*?```", re.DOTALL
+)
 code_lang_format = "```\g<1>\n\g<2>\n```"
 regenerate_pattern = re.compile("\d+ / \d+")
 copy_chars_pattern = re.compile("Copy\d+ chars / \d+ words")
@@ -154,7 +155,9 @@ def clean_html_all(content, begin, end):
     content = content[begin:end]
     processed = []
     with ProcessPoolExecutor() as executor:
-        for result in tqdm(executor.map(clean_html_one_sample, content), total=len(content)):
+        for result in tqdm(
+            executor.map(clean_html_one_sample, content), total=len(content)
+        ):
             processed.append(result)

     visited = {}
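
For reference, a minimal, self-contained sketch (not part of the commit) of how the reformatted code_lang_pattern / code_lang_format pair is typically applied with re.sub; the sample text string is a made-up ShareGPT-style snippet used only for illustration.

import re

# Pattern and replacement copied verbatim from the diff above.
code_lang_pattern = re.compile(
    "```\s*" + "(.*?)" + "(?:Copy code)+" + "(.+?)" + "\s*?```", re.DOTALL
)
code_lang_format = "```\g<1>\n\g<2>\n```"

# Hypothetical input: a scraped code block with the "Copy code" button text
# still wedged between the language tag and the code body.
text = "```pythonCopy codeprint('hello')```"

# Strips the button text and rebuilds a normal fenced block:
# ```python
# print('hello')
# ```
print(code_lang_pattern.sub(code_lang_format, text))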

eval/chat_benchmarks/MTBench/fastchat/data/extract_gpt4_only.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@

 Usage: python3 -m fastchat.data.extract_gpt4_only --in sharegpt.json
 """
-
 import argparse
 import json

eval/chat_benchmarks/MTBench/fastchat/data/extract_single_round.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@

 Usage: python3 -m fastchat.data.extract_single_round --in sharegpt.json
 """
-
 import argparse
 import json

eval/chat_benchmarks/MTBench/fastchat/data/filter_wrong_format.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 python3 -m fastchat.data.filter_wrong_format --in input.json --out output.json

 """
-
 import argparse
 import json
 import re

eval/chat_benchmarks/MTBench/fastchat/data/get_stats.py

Lines changed: 6 additions & 2 deletions
@@ -26,7 +26,9 @@ def tokenize_one_sample(c):
 def tokenize_dataset(content):
     processed = []
     with ProcessPoolExecutor() as executor:
-        for result in tqdm(executor.map(tokenize_one_sample, content), total=len(content)):
+        for result in tqdm(
+            executor.map(tokenize_one_sample, content), total=len(content)
+        ):
             processed.append(result)

     return processed
@@ -57,7 +59,9 @@ def compute_stats(content):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--in-file", type=str)
-    parser.add_argument("--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf")
+    parser.add_argument(
+        "--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
+    )
     args = parser.parse_args()

     content = json.load(open(args.in_file, "r"))
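
As an aside, a self-contained sketch (the toy square function and input list are stand-ins, not the commit's code) of the ProcessPoolExecutor + tqdm loop shape that both reformatted hunks above follow:

from concurrent.futures import ProcessPoolExecutor

from tqdm import tqdm


def square(x):
    # Stand-in for tokenize_one_sample / clean_html_one_sample.
    return x * x


if __name__ == "__main__":
    content = list(range(100))
    processed = []
    with ProcessPoolExecutor() as executor:
        # executor.map preserves input order; total=len(content) lets tqdm
        # show a complete progress bar even though map returns a lazy iterator.
        for result in tqdm(executor.map(square, content), total=len(content)):
            processed.append(result)
    print(len(processed))  # 100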

eval/chat_benchmarks/MTBench/fastchat/data/hardcoded_questions.py

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 """
 Hardcoded question and answers.
 """
-
 import json


eval/chat_benchmarks/MTBench/fastchat/data/inspect_data.py

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 Usage:
 python3 -m fastchat.data.inspect_data --in sharegpt_20230322_clean_lang_split.json
 """
-
 import argparse
 import json
 import random

eval/chat_benchmarks/MTBench/fastchat/data/optional_clean.py

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@
 Requirement:
 pip3 install polyglot pyicu pycld2
 """
-
 import argparse
 import json
 import re

eval/chat_benchmarks/MTBench/fastchat/data/optional_replace.py

Lines changed: 3 additions & 2 deletions
@@ -7,7 +7,6 @@
 Requirement:
 pip3 install transformers tqdm
 """
-
 import argparse
 import json
 import traceback
@@ -16,7 +15,9 @@
 from tqdm import tqdm


-def replace_special_tokens(tokenizer: transformers.PreTrainedTokenizer, text: str) -> str:
+def replace_special_tokens(
+    tokenizer: transformers.PreTrainedTokenizer, text: str
+) -> str:
     if not text:
         return text

eval/chat_benchmarks/MTBench/fastchat/data/prepare_all.py

Lines changed: 8 additions & 2 deletions
@@ -9,14 +9,20 @@
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--prefix", type=str, default="~/datasets/sharegpt_20230521")
-    parser.add_argument("--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf")
+    parser.add_argument(
+        "--model-name-or-path", type=str, default="meta-llama/Llama-2-7b-chat-hf"
+    )
     parser.add_argument("--seq-len", type=int, default=4096)
     args = parser.parse_args()

     in_prefix = args.prefix
     model_path = args.model_name_or_path
     seq_len = args.seq_len
-    prefix = f"{in_prefix}_{seq_len}".replace("4096", "4k").replace("8192", "8k").replace("16384", "16k")
+    prefix = (
+        f"{in_prefix}_{seq_len}".replace("4096", "4k")
+        .replace("8192", "8k")
+        .replace("16384", "16k")
+    )

     cmd_list = [
         f"python3 -m fastchat.data.clean_sharegpt --in {in_prefix}_html.json --out {prefix}_clean.json",
