-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
70 lines (55 loc) · 1.96 KB
/
data_loader.py
File metadata and controls
70 lines (55 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import datasets as hf_datasets
from torch.utils.data import DataLoader, Dataset
from transformers import PreTrainedTokenizerFast as ptf
import preprocessor as pp
class CodeDataset(Dataset):
    """Map-style dataset that preprocesses records lazily, on access.

    The wrapped ``data`` object is stored as-is; each lookup fetches the raw
    record at the requested index and passes it through
    ``pp.preprocess_record`` before returning it.
    """

    def __init__(self, data: hf_datasets.Dataset):
        """Keep a reference to the underlying split; nothing is copied."""
        self.data = data

    def __len__(self):
        """Return the number of records in the wrapped split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the preprocessed form of the record stored at ``index``."""
        raw_record = self.data[index]
        # NOTE(review): the collator in this file discards ``None`` items, so
        # preprocess_record presumably returns None for unusable records --
        # confirm against the preprocessor module.
        return pp.preprocess_record(raw_record)
class CodeInputCollator:
    """Collate function that tokenizes (description, code) pairs into a batch.

    ``tokenizer`` is called with two parallel lists of values (descriptions
    first, code second) plus padding/truncation options, and its result is
    indexed by ``"input_ids"`` and ``"attention_mask"``.
    """

    def __init__(self, tokenizer, max_len):
        """Remember the tokenizer and the ``max_length`` forwarded to it."""
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        """Tokenize the usable examples of ``batch``.

        Args:
            batch: Sequence of examples; each is either ``None`` or a mapping
                with ``"description"`` and ``"code"`` entries.

        Returns:
            A dict holding only the tokenizer's ``"input_ids"`` and
            ``"attention_mask"`` outputs.
        """
        # Drop the examples that came through as None before tokenizing.
        usable = []
        for item in batch:
            if item is None:
                continue
            usable.append(item)

        descriptions = [item["description"] for item in usable]
        snippets = [item["code"] for item in usable]

        encoded = self.tokenizer(
            descriptions,
            snippets,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=self.max_len,
        )
        # Forward only these two fields; anything else the tokenizer returned
        # is intentionally left out of the batch.
        return {field: encoded[field] for field in ("input_ids", "attention_mask")}
def get_data_loaders(
    tokenizer: ptf,
    max_len: int,
    batch_size: int = 8,
) -> tuple[DataLoader, DataLoader, DataLoader]:
    """Build the train, validation and test loaders for ``code_search_net``.

    Loads the ``"python"`` configuration of the ``code_search_net`` dataset,
    wraps each split in a ``CodeDataset`` and batches it through a single
    shared ``CodeInputCollator``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        max_len: ``max_length`` the collator forwards to the tokenizer.
        batch_size: Number of examples per batch for all three loaders.
            Defaults to 8, the value that was previously hard-coded.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``. Only the
        training loader shuffles; validation and test keep dataset order.
    """
    # NOTE(review): trust_remote_code=True allows the dataset repository's
    # loading script to run locally. Kept to preserve behavior -- confirm the
    # source is trusted (or pin a revision) before relying on it.
    dataset = hf_datasets.load_dataset("code_search_net", "python", trust_remote_code=True)
    collator = CodeInputCollator(tokenizer, max_len=max_len)

    def build_loader(split: str, *, shuffle: bool) -> DataLoader:
        # One place for the settings shared by every split.
        return DataLoader(
            CodeDataset(dataset[split]),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collator,
        )

    train_dataloader = build_loader("train", shuffle=True)
    val_dataloader = build_loader("validation", shuffle=False)
    test_dataloader = build_loader("test", shuffle=False)
    return train_dataloader, val_dataloader, test_dataloader