
Commit 413cf93

1 parent e3f3649 commit 413cf93

File tree: 305 files changed (+64494, -123 lines)


ML/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -18,13 +18,15 @@
 from torchvision import transforms
 from torchvision.models import *
 from tqdm import tqdm
-from wandb import AlertLevel
+from wandb import *
 from torch.nn import *
 from torchvision.models import *
 import torchtext
 from torchtext.transforms import *
 from torchtext.models import *
-from sklearn.metrics import classification_report
+from sklearn.metrics import *
+from torch.hub import *
+import torchtext.functional as F
 
 print(torch.__version__, torchvision.__version__, torchtext.__version__)
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
@@ -40,5 +42,4 @@
 
 from ML.dataset import *
 from ML.helper_functions import *
-from ML.metrics import *
 from ML.modelling import *

ML/dataset/loader.py

Lines changed: 2 additions & 2 deletions
@@ -2,10 +2,10 @@
 
 
 class Loader(Dataset):
-    def __init__(self, path: str, transform=None) -> None:
+    def __init__(self, path: str, transform: bool = None) -> None:
         self.path = path
         self.transform = transform
-        self.data: pd.DataFrame = pd.read_csv(self.path)
+        self.data: pd.DataFrame = pd.read_csv(self.path).iloc[:5000]
 
     def __len__(self) -> int:
         return len(self.data)
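
Note: despite the new transform: bool = None annotation, the attribute holds a callable in practice (the subclasses below invoke it as self.transform(...)), and .iloc[:5000] caps every dataset at its first 5,000 rows, presumably to bound memory and training time. A quick sketch of why the cap is safe even for shorter files, since pandas slicing past the end simply returns the whole frame:

import pandas as pd

# Hypothetical three-row frame with the columns Main_DL reads:
df = pd.DataFrame({"text": ["a", "b", "c"], "target": [0, 1, 0]})
print(len(df.iloc[:5000]))  # 3 -- .iloc[:5000] never raises on short frames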

ML/dataset/main_loaders.py

Lines changed: 42 additions & 6 deletions
@@ -1,9 +1,17 @@
 from ML import *
+from ML.dataset.loader import *
 
 
 class Main_DL(Loader):
-    def __init__(self, train: bool = True, test_split: float = 0.125, seed: int = 42) -> None:
-        super().__init__()
+    def __init__(
+        self,
+        train: bool = True,
+        test_split: float = 0.125,
+        seed: int = 42,
+        batch_size: int = 32,
+        **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
         self.X = self.data["text"].to_numpy()
         self.y = self.data["target"].to_numpy()
         self.train = train
@@ -12,16 +20,44 @@ def __init__(self, train: bool = True, test_split: float = 0.125, seed: int = 42
         self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
             self.X, self.y, test_size=test_split, random_state=seed
         )
+        self.X_train, self.X_test, self.y_train, self.y_test = (
+            np.array(self.X_train),
+            np.array(self.X_test),
+            np.array(self.y_train),
+            np.array(self.y_test),
+        )
+        self.batch_size = batch_size
+        # self.get_batches()
+
+
+    def get_batches(self):
+        X = self.X_train if self.train else self.X_test
+        y = self.y_train if self.train else self.y_test
+        X_batches = []
+        y_batches = []
+        for i in range(0, len(X), self.batch_size):
+            X_iter = X[i : i + self.batch_size]
+            y_iter = y[i : i + self.batch_size]
+            X_batches.append(X_iter)
+            y_batches.append(y_iter)
+        if self.train:
+            self.X_train = F.to_tensor(X_batches, padding_value=1)
+            self.y_train = np.array(y_batches)
+        else:
+            self.X_test = F.to_tensor(X_batches, padding_value=1)
+            self.y_test = np.array(y_batches)
+
+        print(X_batches[0], y_batches[0])
 
     def __getitem__(self, index) -> Tuple[torch.tensor, torch.tensor]:
         if self.train:
             return (
-                self.transform(self.X_train[index]) if self.transform else self.X_train[index],
-                self.y_train[index],
+                self.transform(self.X_train[index]),
+                [self.y_train[index]],
             )
         return (
-            self.transform(self.X_test[index]) if self.transform else self.X_test[index],
-            self.y_test[index],
+            self.transform(self.X_test[index]),
+            [self.y_test[index]],
         )
 
     def __len__(self) -> int:
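
Note: the (currently commented-out) get_batches relies on torchtext.functional.to_tensor to pad each ragged batch of token ids into a rectangular tensor. A minimal sketch of that padding behaviour, assuming torchtext >= 0.12 and XLM-R's convention of 1 as the <pad> index:

import torchtext.functional as F

# Two tokenised sentences of unequal length:
batch = [[0, 411, 9, 2], [0, 87, 2]]
padded = F.to_tensor(batch, padding_value=1)  # pads to the longest sequence
print(padded)
# tensor([[  0, 411,   9,   2],
#         [  0,  87,   2,   1]])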

ML/dataset/valid_loaders.py

Lines changed: 4 additions & 3 deletions
@@ -1,10 +1,11 @@
 from ML import *
+from ML.dataset.loader import *
 
 
 class Valid_Loader(Loader):
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
         self.X = self.data["text"].to_numpy()
 
     def __getitem__(self, index) -> np.array:
-        return self.X[index]
+        return self.transform(self.X[index])

ML/helper_functions/load_data.py

Lines changed: 8 additions & 10 deletions
@@ -26,33 +26,31 @@ def __init__(
     def ld(self) -> Tuple[DataLoader, DataLoader, DataLoader]:
         self.train_data_loader = DataLoader(
             self.dataset_main(
-                self.main_path,
-                self.main_transform,
+                path=self.main_path,
+                transform=self.main_transform,
                 train=True,
                 test_split=self.test_split,
                 seed=self.seed,
             ),
-            batch_size=self.main_batch_size,
+            batch_size=None,
             shuffle=True,
             num_workers=round(os.cpu_count() / 2),
         )
         self.test_data_loader = DataLoader(
             self.dataset_main(
-                self.main_path,
-                self.main_transform,
+                path=self.main_path,
+                transform=self.main_transform,
                 train=False,
                 test_split=self.test_split,
                 seed=self.seed,
             ),
-            batch_size=self.main_batch_size,
+            batch_size=None,
             shuffle=True,
             num_workers=round(os.cpu_count() / 2),
        )
         self.valid_data_loader = DataLoader(
-            self.dataset_valid(
-                self.valid_path,
-            ),
-            batch_size=self.valid_batch_size,
+            self.dataset_valid(self.valid_path, None),
+            batch_size=None,
             shuffle=False,
             num_workers=round(os.cpu_count() / 2),
         )
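
Note: all three loaders now pass batch_size=None, which disables the DataLoader's automatic batching: each yielded element is exactly one __getitem__ result, with no collation and no extra batch dimension. That matches the datasets being expected to batch themselves (get_batches above), and it is why the training and test loops later re-add a batch dimension with .view(1, -1). It also appears that dataset_valid is built with transform=None while Valid_Loader.__getitem__ now calls self.transform unconditionally, so iterating the valid loader as committed would raise a TypeError. A self-contained sketch of the batch_size=None behaviour (the Toy dataset is hypothetical):

import torch
from torch.utils.data import DataLoader, Dataset

class Toy(Dataset):
    def __init__(self):
        self.items = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
    def __len__(self):
        return len(self.items)
    def __getitem__(self, i):
        return self.items[i]

# With batch_size=None, items pass through uncollated:
for x in DataLoader(Toy(), batch_size=None):
    print(x.shape)  # torch.Size([3]), then torch.Size([2])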

ML/helper_functions/test.py

Lines changed: 3 additions & 1 deletion
@@ -24,6 +24,8 @@ def test(self):
         n = 0
         with torch.inference_mode():
             for X, y in self.test_dataloader:
+                X = torch.tensor(X).to("cuda").view(1, -1)
+                y = torch.tensor(y).to("cuda")
                 preds = torch.argmax(torch.softmax(self.model(X), dim=1), dim=1)
                 results = classification_report(preds, y, class_names=["0", "1"])
                 precision = results["weighted avg"]["precision"]
@@ -38,6 +40,6 @@ def test(self):
         return {
             f"{self.name} precision": p_tot / n,
             f"{self.name} recall": r_tot / n,
-            f"{self.name} f1-score": f_tot / n,
+            f"{self.name} f1-score": f1_tot / n,
             f"{self.name} accuracy": a_tot / n,
         }
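
Note: stock scikit-learn's classification_report takes (y_true, y_pred) in that order, uses target_names rather than class_names, and only returns an indexable dict when output_dict=True. Unless a different classification_report is being shadowed in via the new wildcard imports, the call as committed would not return the dict indexed on the following line. The standard usage, for comparison:

from sklearn.metrics import classification_report

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
results = classification_report(y_true, y_pred, target_names=["0", "1"], output_dict=True)
print(results["weighted avg"]["precision"])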

ML/helper_functions/train.py

Lines changed: 15 additions & 3 deletions
@@ -1,4 +1,5 @@
 from ML import *
+import torchtext.functional as F
 
 
 class Train:
@@ -12,6 +13,7 @@ def __init__(
         valid_dataloader: DataLoader,
         criterion: torch.nn,
         optimizer: torch.optim,
+        lr_schedular: bool = None,
     ) -> None:
         self.model = model
         self.epochs = epochs
@@ -21,21 +23,29 @@ def __init__(
         self.valid_dataloader = valid_dataloader
         self.criterion = criterion
         self.optimizer = optimizer
+        self.lr_schedular = lr_schedular
 
     def train(self, run_name):
         print(torchinfo.summary(self.model))
-        wandb.init(project=PROJECT_NAME, entity=run_name)
+        wandb.init(project=PROJECT_NAME, name=run_name, config=self.config)
         wandb.watch(self.model, log="all")
         iterator = tqdm(range(self.epochs))
         for _ in iterator:
+            torch.cuda.empty_cache()
             for i, (X, y) in enumerate(self.train_dataloader):
+                torch.cuda.empty_cache()
+                X = torch.tensor(X).to("cuda").view(1, -1)
+                y = torch.tensor(y).to("cuda")
+                print(X.shape, y.shape)
                 self.optimizer.zero_grad()
                 loss = self.criterion(self.model(X), y)
                 loss.backward()
-                self.optimizer.step(f"{i}/{len(self.train_dataloader)}")
-                iterator.set_description()
+                self.optimizer.step()
+                iterator.set_description(f"{i}/{len(self.train_dataloader)}")
                 if self.lr_schedular:
                     self.lr_schedular.step()
+            iterator.set_description(f"Testing...")
+            self.model.eval()
             wandb.log(
                 Test(
                     self.test_dataloader, self.valid_dataloader, self.criterion, self.model, "Test"
@@ -50,5 +60,7 @@ def train(self, run_name):
                     "Train",
                 ).test()
             )
+            iterator.set_description(f"Testing Done")
+            self.model.train()
         wandb.save()
         wandb.finish()
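
Note: the new lr_schedular parameter is annotated bool, but the loop calls .step() on it, so what is presumably meant is any torch.optim.lr_scheduler object (or None). Also, wandb.init now reads self.config, which is not assigned anywhere in this diff. A small runnable sketch of a scheduler that satisfies the interface (names here are illustrative):

import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
lr_schedular = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

optimizer.step()     # PyTorch >= 1.1 expects optimizer.step() before scheduler.step()
lr_schedular.step()  # the call the training loop makes when a scheduler is passed
print(optimizer.param_groups[0]["lr"])  # ~0.09 after one decay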

ML/helper_functions/transformer.py

Lines changed: 8 additions & 8 deletions
@@ -7,9 +7,9 @@ def __init__(
         padding_idx: int = 1,
         beg_idx: int = 0,
         end_idx: int = 2,
-        max_seq_len: int = 256,
-        vocab_path: str = r"https://download.pytorch.org/models/text/xlmr.vocab.pt",
-        spm_model_path: str = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model",
+        max_seq_len: int = 256 - 2,
+        vocab_path=r"https://download.pytorch.org/models/text/xlmr.vocab.pt",
+        spm_model_path=r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model",
         tokenizer: torchtext.transforms = SentencePieceTokenizer,
         vocab_transform: torchtext.transforms = VocabTransform,
         truncate: torchtext.transforms = Truncate,
@@ -24,13 +24,13 @@ def __init__(
         self.vocab_transform = vocab_transform
         self.truncate = truncate
 
-    def transform(self):
-        t = torchtext.transforms.Compose(
-            self.tokenizer(self.vocab_path),
-            self.vocab_transform(self.spm_model_path),
+    def transform(self) -> torchtext.transforms.Sequential:
+        t = torchtext.transforms.Sequential(
+            self.tokenizer(self.spm_model_path),
+            self.vocab_transform(load_state_dict_from_url(self.vocab_path)),
             self.truncate(self.max_seq_len),
             AddToken(self.beg_idx, begin=True),
-            AddToken(self.end_idx, end=True),
+            AddToken(self.end_idx, begin=False),
         )
         return t
 
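
Note: the rewritten transform now lines up with torchtext's documented XLM-R preprocessing: Sequential rather than a nonexistent torchtext.transforms.Compose, the SentencePiece model path fed to the tokenizer, the vocab loaded through torch.hub's load_state_dict_from_url, and AddToken(..., begin=False) for the end token (AddToken has no end keyword). A standalone sketch of the equivalent pipeline, assuming torchtext >= 0.12 and these hosted assets:

import torchtext.transforms as T
from torch.hub import load_state_dict_from_url

xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
    T.SentencePieceTokenizer(xlmr_spm_model_path),
    T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
    T.Truncate(256 - 2),               # leave room for the two special tokens
    T.AddToken(token=0, begin=True),   # <s>
    T.AddToken(token=2, begin=False),  # </s>
)
ids = text_transform(["Hello World"])  # [[0, ..., 2]] -- token ids with BOS/EOS attached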

ML/modelling/tt.py

Lines changed: 2 additions & 1 deletion
@@ -9,10 +9,11 @@ def __init__(
         classifier_head: torchtext.models = RobertaClassificationHead,
         model: torchtext.models = XLMR_BASE_ENCODER,
     ) -> None:
+        super().__init__()
         self.num_classes = num_classes
         self.input_dim = input_dim
         self.classifier_head = classifier_head(num_classes, input_dim)
-        self.model = model(self.classifier_head).to(device)
+        self.model = model.get_model(head=self.classifier_head).to(device)
 
     def forward(self, X):
         return self.model(X)
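
Note: the two fixes here are the previously missing super().__init__() (required before assigning submodules on an nn.Module) and the bundle API: XLMR_BASE_ENCODER is a pre-trained model bundle, so the model is built via .get_model(head=...) rather than by calling the bundle. A standalone sketch, assuming torchtext >= 0.12:

import torch
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER

device = "cuda" if torch.cuda.is_available() else "cpu"
classifier_head = RobertaClassificationHead(num_classes=2, input_dim=768)  # 768 = XLM-R base hidden size
model = XLMR_BASE_ENCODER.get_model(head=classifier_head).to(device)

# A padded batch of token ids (see the transform sketch above) goes straight in:
tokens = torch.tensor([[0, 411, 9, 2]]).to(device)
logits = model(tokens)  # shape: (1, 2)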
