
Commit afc0c69

reformat code
1 parent f2ac68e commit afc0c69

15 files changed: +255 -191 lines changed
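Every hunk below is a formatting-only change: long calls are wrapped across lines, spaces around "=" in keyword arguments are dropped, and imports are regrouped and condensed. This is consistent with a pass of automatic formatters such as isort (for the import blocks) and black (for the call wrapping), though that is only an assumption; the commit message says nothing beyond "reformat code". A hedged sketch of how such a pass could be checked locally, assuming those two tools:

import subprocess

# Exit code 0 means the working tree already satisfies the formatter.
# Whether these exact tools (or their default settings) were used is an assumption.
for cmd in (["black", "--check", "."], ["isort", "--check-only", "."]):
    result = subprocess.run(cmd)
    print(cmd[0], "exit code:", result.returncode)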

finetuning/APPS/apps_dataset.py

Lines changed: 3 additions & 1 deletion
@@ -10,7 +10,9 @@ class APPSBaseDataset(torch.utils.data.Dataset):
     def __init__(self, dataset, max_tokens, tokenizer_path):
         self.dataset = dataset
         self.max_tokens = max_tokens
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_auth_token=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_path, use_auth_token=True
+        )
         self.samples = []  # Should be set in initialize()

         self.initialize(self.tokenizer)

finetuning/APPS/apps_train.py

Lines changed: 15 additions & 16 deletions
@@ -4,17 +4,12 @@

 import argparse
 import os
-import torch

+import torch
 from apps_dataset import APPSBaseDataset
 from datasets import load_dataset
-from transformers import (
-    AutoModelForCausalLM,
-    Trainer,
-    TrainingArguments,
-    logging,
-    set_seed,
-)
+from transformers import (AutoModelForCausalLM, Trainer, TrainingArguments,
+                          logging, set_seed)


 def get_args():
@@ -59,22 +54,20 @@ def run_training(args, train_data, val_data):
     training_args = TrainingArguments(
         output_dir=args.output_dir,
         dataloader_drop_last=True,
-        evaluation_strategy = "steps",
+        evaluation_strategy="steps",
         num_train_epochs=args.num_epochs,
-        max_steps = args.max_steps,
-        eval_steps = args.eval_freq,
+        max_steps=args.max_steps,
+        eval_steps=args.eval_freq,
         save_steps=args.save_freq,
         logging_steps=args.log_freq,
-
         per_device_train_batch_size=args.batch_size,
         per_device_eval_batch_size=args.batch_size,
         learning_rate=args.learning_rate,
         lr_scheduler_type=args.lr_scheduler_type,
-        warmup_steps = args.num_warmup_steps,
+        warmup_steps=args.num_warmup_steps,
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         weight_decay=args.weight_decay,
         fp16=args.fp16,
-
         run_name="apps-train",
         report_to="wandb",
     )
@@ -99,8 +92,14 @@ def main(args):
     dataset.shuffle(seed=args.seed)
     data = get_dataset(dataset, args)
     train_size = int(0.95 * len(data))
-    train_data, val_data = torch.utils.data.random_split(data, [train_size, len(data) - train_size], generator=torch.Generator().manual_seed(args.seed))
-    print(f"size of training data {len(train_data)}\nsize of validation data {len(val_data)}")
+    train_data, val_data = torch.utils.data.random_split(
+        data,
+        [train_size, len(data) - train_size],
+        generator=torch.Generator().manual_seed(args.seed),
+    )
+    print(
+        f"size of training data {len(train_data)}\nsize of validation data {len(val_data)}"
+    )
     run_training(args, train_data, val_data)

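The main() hunk above carries the interesting logic of this file: a 95/5 train/validation split whose randomness is pinned by a seeded torch.Generator, so reruns reproduce the same split. A minimal, self-contained sketch of that pattern; the toy TensorDataset and seed 0 are assumptions standing in for the real APPS data and args.seed:

import torch
from torch.utils.data import TensorDataset, random_split

data = TensorDataset(torch.arange(100))  # placeholder for the APPS dataset
train_size = int(0.95 * len(data))
train_data, val_data = random_split(
    data,
    [train_size, len(data) - train_size],
    generator=torch.Generator().manual_seed(0),  # fixed seed -> identical split on rerun
)
print(f"size of training data {len(train_data)}\nsize of validation data {len(val_data)}")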

finetuning/Code-to-text/train.py

Lines changed: 20 additions & 14 deletions
@@ -1,19 +1,15 @@
 import argparse

 from datasets import load_dataset
-
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    Trainer,
-    TrainingArguments,
-    set_seed,
-)
+from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                          Trainer, TrainingArguments, set_seed)


 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine")
+    parser.add_argument(
+        "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine"
+    )
     parser.add_argument("--language", type=str, default="Python")
     parser.add_argument("--max_length", type=int, default=1024)
     parser.add_argument("--num_epochs", type=int, default=5)
@@ -40,7 +36,9 @@ def main():
     print("Loading tokenizer and model")
     tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
     tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=2)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        args.model_ckpt, num_labels=2
+    )
     model.config.pad_token_id = model.config.eos_token_id

     if args.freeze:
@@ -49,13 +47,20 @@ def main():

     def tokenize(example):
         if args.language == "Python":
-            #remove docstring from code
+            # remove docstring from code
             chunks = example["code"].split('"""')
             code = chunks[0].strip() + chunks[2]
         else:
             code = example["code"]
-        inputs = tokenizer(code, padding="max_length", truncation=True, max_length=args.max_length)
-        labels = tokenizer(example["docstring"], padding="max_length", truncation=True, max_length=args.max_length).input_ids
+        inputs = tokenizer(
+            code, padding="max_length", truncation=True, max_length=args.max_length
+        )
+        labels = tokenizer(
+            example["docstring"],
+            padding="max_length",
+            truncation=True,
+            max_length=args.max_length,
+        ).input_ids
         labels_with_ignore_index = []
         for labels_example in labels:
             labels_example = [label if label != 0 else -100 for label in labels_example]
@@ -99,10 +104,11 @@ def tokenize(example):

     print("Training...")
     trainer.train()
-
+
     # push the model to the Hugging Face hub
     if args.push_to_hub:
         model.push_to_hub(args.model_hub_name)

+
 if __name__ == "__main__":
     main()
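In the tokenize() hunk above, label positions whose id is 0 (the pad id in this setup) are replaced with -100, the ignore index of PyTorch's cross-entropy loss, so padded positions do not contribute to the loss. A small sketch of that masking step with hand-written label ids; the toy ids and the final append are assumptions made to keep it self-contained, since the diff only shows the start of the loop:

# 0 plays the role of the pad token id in this toy example.
labels = [[101, 7, 42, 0, 0], [101, 9, 0, 0, 0]]

labels_with_ignore_index = []
for labels_example in labels:
    labels_example = [label if label != 0 else -100 for label in labels_example]
    labels_with_ignore_index.append(labels_example)  # append inferred, not shown in the diff

print(labels_with_ignore_index)
# [[101, 7, 42, -100, -100], [101, 9, -100, -100, -100]]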

finetuning/CodeClone/train.py

Lines changed: 21 additions & 16 deletions
@@ -3,22 +3,17 @@

 import numpy as np
 from datasets import ClassLabel, load_dataset
-
 from evaluate import load
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    Trainer,
-    TrainerCallback,
-    TrainingArguments,
-    set_seed,
-)
+from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                          DataCollatorWithPadding, Trainer, TrainerCallback,
+                          TrainingArguments, set_seed)


 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine")
+    parser.add_argument(
+        "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine"
+    )
     parser.add_argument("--max_length", type=int, default=1024)
     parser.add_argument("--num_epochs", type=int, default=5)
     parser.add_argument("--batch_size", type=int, default=6)
@@ -52,7 +47,9 @@ def __init__(self, trainer) -> None:
     def on_epoch_end(self, args, state, control, **kwargs):
         if control.should_evaluate:
             control_copy = deepcopy(control)
-            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
+            self._trainer.evaluate(
+                eval_dataset=self._trainer.train_dataset, metric_key_prefix="train"
+            )
             return control_copy


@@ -61,21 +58,28 @@ def main():
     set_seed(args.seed)

     ds = load_dataset("code_x_glue_cc_clone_detection_big_clone_bench")
-    labels = ClassLabel(num_classes = 2, names=[True, False])
+    labels = ClassLabel(num_classes=2, names=[True, False])
     ds = ds.cast_column("label", labels)

     print("Loading tokenizer and model")
     tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
     tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=2)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        args.model_ckpt, num_labels=2
+    )
     model.config.pad_token_id = model.config.eos_token_id

     if args.freeze:
         for param in model.roberta.parameters():
             param.requires_grad = False

     def tokenize(example):
-        inputs = tokenizer(example["func1"], example["func2"], truncation=True, max_length=args.max_length)
+        inputs = tokenizer(
+            example["func1"],
+            example["func2"],
+            truncation=True,
+            max_length=args.max_length,
+        )
         return {
             "input_ids": inputs["input_ids"],
             "attention_mask": inputs["attention_mask"],
@@ -121,10 +125,11 @@ def tokenize(example):

     result = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
     print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}")
-
+
     # push the model to the Hugging Face hub
     if args.push_to_hub:
         model.push_to_hub(args.model_hub_name)

+
 if __name__ == "__main__":
     main()
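The on_epoch_end hunk above belongs to a TrainerCallback that, whenever an end-of-epoch evaluation is due, also evaluates on the training set with metric_key_prefix="train", so training and validation metrics can be compared in the logs. A self-contained sketch of the whole callback; the class name and the __init__ body are inferred from the "def __init__(self, trainer)" hunk header and should be read as assumptions:

from copy import deepcopy

from transformers import TrainerCallback


class EvaluateTrainSetCallback(TrainerCallback):  # hypothetical name
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            # Re-run evaluation on the training split; metrics get the "train_" prefix.
            self._trainer.evaluate(
                eval_dataset=self._trainer.train_dataset, metric_key_prefix="train"
            )
            return control_copy

Such a callback would typically be registered after the Trainer is built, e.g. trainer.add_callback(EvaluateTrainSetCallback(trainer)).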

finetuning/CodeComplex/train.py

Lines changed: 17 additions & 15 deletions
@@ -3,22 +3,17 @@

 import numpy as np
 from datasets import ClassLabel, DatasetDict, load_dataset
-
 from evaluate import load
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    Trainer,
-    TrainerCallback,
-    TrainingArguments,
-    set_seed,
-)
+from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                          DataCollatorWithPadding, Trainer, TrainerCallback,
+                          TrainingArguments, set_seed)


 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine")
+    parser.add_argument(
+        "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine"
+    )
     parser.add_argument("--num_epochs", type=int, default=5)
     parser.add_argument("--batch_size", type=int, default=6)
     parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
@@ -51,7 +46,9 @@ def __init__(self, trainer) -> None:
     def on_epoch_end(self, args, state, control, **kwargs):
         if control.should_evaluate:
             control_copy = deepcopy(control)
-            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
+            self._trainer.evaluate(
+                eval_dataset=self._trainer.train_dataset, metric_key_prefix="train"
+            )
             return control_copy


@@ -73,14 +70,18 @@ def main():
     print("Loading tokenizer and model")
     tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
     tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=7)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        args.model_ckpt, num_labels=7
+    )
     model.config.pad_token_id = model.config.eos_token_id

     if args.freeze:
         for param in model.roberta.parameters():
             param.requires_grad = False

-    labels = ClassLabel(num_classes=7, names=list(set(train_test_validation["train"]["complexity"])))
+    labels = ClassLabel(
+        num_classes=7, names=list(set(train_test_validation["train"]["complexity"]))
+    )

     def tokenize(example):
         inputs = tokenizer(example["src"], truncation=True, max_length=1024)
@@ -131,10 +132,11 @@ def tokenize(example):

     result = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
     print(f"Evaluation accuracy on the test set: {result['eval_accuracy']}")
-
+
     # push the model to the Hugging Face hub
     if args.push_to_hub:
         model.push_to_hub(args.model_hub_name)

+
 if __name__ == "__main__":
     main()

finetuning/CodeDefect/train.py

Lines changed: 14 additions & 14 deletions
@@ -3,22 +3,17 @@

 import numpy as np
 from datasets import ClassLabel, load_dataset
-
 from evaluate import load
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    Trainer,
-    TrainerCallback,
-    TrainingArguments,
-    set_seed,
-)
+from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
+                          DataCollatorWithPadding, Trainer, TrainerCallback,
+                          TrainingArguments, set_seed)


 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine")
+    parser.add_argument(
+        "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine"
+    )
     parser.add_argument("--max_length", type=int, default=1024)
     parser.add_argument("--num_epochs", type=int, default=5)
     parser.add_argument("--batch_size", type=int, default=6)
@@ -52,7 +47,9 @@ def __init__(self, trainer) -> None:
     def on_epoch_end(self, args, state, control, **kwargs):
         if control.should_evaluate:
             control_copy = deepcopy(control)
-            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
+            self._trainer.evaluate(
+                eval_dataset=self._trainer.train_dataset, metric_key_prefix="train"
+            )
             return control_copy


@@ -61,14 +58,16 @@ def main():
     set_seed(args.seed)

     ds = load_dataset("code_x_glue_cc_defect_detection")
-    labels = ClassLabel(num_classes = 2, names=[True, False])
+    labels = ClassLabel(num_classes=2, names=[True, False])
     ds = ds.cast_column("target", labels)
     ds = ds.rename_column("target", "label")

     print("Loading tokenizer and model")
     tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
     tokenizer.pad_token = tokenizer.eos_token
-    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=2)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        args.model_ckpt, num_labels=2
+    )
     model.config.pad_token_id = model.config.eos_token_id

     if args.freeze:
@@ -128,5 +127,6 @@ def tokenize(example):
     if args.push_to_hub:
         model.push_to_hub(args.model_hub_name)

+
 if __name__ == "__main__":
     main()
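The defect-detection hunk above casts the dataset's "target" column to a two-class ClassLabel and renames it to "label", the column name Trainer expects. A minimal sketch of the same pattern on a toy in-memory dataset; the toy rows and the string class names are assumptions (the script itself passes names=[True, False]):

from datasets import ClassLabel, Dataset

ds = Dataset.from_dict(
    {
        "func": ["int f() { return 0; }", "void g() { free(p); free(p); }"],
        "target": [0, 1],
    }
)
labels = ClassLabel(num_classes=2, names=["no_defect", "defect"])
ds = ds.cast_column("target", labels)  # integer 0/1 values become class ids
ds = ds.rename_column("target", "label")
print(ds.features["label"])  # e.g. ClassLabel(names=['no_defect', 'defect'], ...)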

lm_eval/base.py

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,8 @@
-from abc import abstractmethod, ABC
-from datasets import load_dataset
+from abc import ABC, abstractmethod
 from warnings import warn

+from datasets import load_dataset
+

 class Task(ABC):
     """A task represents an entire benchmark including its dataset, problems,

lm_eval/evaluator.py

Lines changed: 3 additions & 1 deletion
@@ -71,7 +71,9 @@ def evaluate(self, task_name):
         if self.args.save_generations:
             with open(self.args.save_generations_path, "w") as fp:
                 json.dump(generations, fp)
-                print(f"generations were saved at {self.args.save_generations_path}")
+                print(
+                    f"generations were saved at {self.args.save_generations_path}"
+                )
         if self.args.save_references:
             with open("references.json", "w") as fp:
                 json.dump(references, fp)
