Skip to content

Commit cd607c3

Browse files
updated basic tutorials, better comments, code revision, checked it works with latest pytorch version
1 parent 3f53d68 commit cd607c3

File tree

14 files changed

+162
-88
lines changed

14 files changed

+162
-88
lines changed

ML/Pytorch/Basics/Imbalanced_classes/main.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
"""
2+
This code is for dealing with imbalanced datasets in PyTorch. Imbalanced datasets are those where the number of samples in one or more classes is significantly lower than the number of samples in the other classes. This can be a problem because it can lead to a model that is biased towards the more common classes, which can result in poor performance on the less common classes.
3+
4+
To deal with imbalanced datasets, this code implements two methods: oversampling and class weighting.
5+
6+
Oversampling involves generating additional samples for the underrepresented classes, while class weighting involves assigning higher weights to the loss of samples in the underrepresented classes, so that the model pays more attention to them.
7+
8+
In this code, the get_loader function takes a root directory for a dataset and a batch size, and returns a PyTorch data loader. The data loader is used to iterate over the dataset in batches. The get_loader function first applies some transformations to the images in the dataset using the transforms module from torchvision. Then it calculates the class weights based on the number of samples in each class. It then creates a WeightedRandomSampler object, which is used to randomly select a batch of samples with a probability proportional to their weights. Finally, it creates the data loader using the dataset and the weighted random sampler.
9+
10+
The main function then uses the data loader to iterate over the dataset for 10 epochs, and counts the number of samples in each class. Finally, it prints the counts for each class.
11+
12+
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
13+
* 2020-04-08: Initial coding
14+
* 2021-03-24: Added more detailed comments; also removed the part of
15+
check_accuracy which would only work specifically on MNIST.
16+
* 2022-12-19: Updated detailed comments, small code revision, checked code still works with latest PyTorch.
17+
"""
18+
119
import torch
220
import torchvision.datasets as datasets
321
import os
@@ -6,9 +24,10 @@
624
import torch.nn as nn
725

826
# Methods for dealing with imbalanced datasets:
9-
# 1. Oversampling
27+
# 1. Oversampling (probably preferable)
1028
# 2. Class weighting
1129

30+
1231
def get_loader(root_dir, batch_size):
1332
my_transforms = transforms.Compose(
1433
[
@@ -18,19 +37,24 @@ def get_loader(root_dir, batch_size):
1837
)
1938

2039
dataset = datasets.ImageFolder(root=root_dir, transform=my_transforms)
40+
subdirectories = dataset.classes
2141
class_weights = []
22-
for root, subdir, files in os.walk(root_dir):
23-
if len(files) > 0:
24-
class_weights.append(1/len(files))
42+
43+
# loop through each subdirectory and calculate the class weight
44+
# that is 1 / len(files) in that subdirectory
45+
for subdir in subdirectories:
46+
files = os.listdir(os.path.join(root_dir, subdir))
47+
class_weights.append(1 / len(files))
2548

2649
sample_weights = [0] * len(dataset)
2750

2851
for idx, (data, label) in enumerate(dataset):
2952
class_weight = class_weights[label]
3053
sample_weights[idx] = class_weight
3154

32-
sampler = WeightedRandomSampler(sample_weights, num_samples=
33-
len(sample_weights), replacement=True)
55+
sampler = WeightedRandomSampler(
56+
sample_weights, num_samples=len(sample_weights), replacement=True
57+
)
3458

3559
loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
3660
return loader
@@ -43,12 +67,12 @@ def main():
4367
num_elkhounds = 0
4468
for epoch in range(10):
4569
for data, labels in loader:
46-
num_retrievers += torch.sum(labels==0)
47-
num_elkhounds += torch.sum(labels==1)
70+
num_retrievers += torch.sum(labels == 0)
71+
num_elkhounds += torch.sum(labels == 1)
72+
73+
print(num_retrievers.item())
74+
print(num_elkhounds.item())
4875

49-
print(num_retrievers)
50-
print(num_elkhounds)
5176

5277
if __name__ == "__main__":
5378
main()
54-

ML/Pytorch/Basics/albumentations_tutorial/classification.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44
from utils import plot_examples
55
from PIL import Image
6+
from tqdm import tqdm
67

78
image = Image.open("images/elon.jpeg")
89

@@ -14,18 +15,20 @@
1415
A.HorizontalFlip(p=0.5),
1516
A.VerticalFlip(p=0.1),
1617
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
17-
A.OneOf([
18-
A.Blur(blur_limit=3, p=0.5),
19-
A.ColorJitter(p=0.5),
20-
], p=1.0),
18+
A.OneOf(
19+
[
20+
A.Blur(blur_limit=3, p=0.5),
21+
A.ColorJitter(p=0.5),
22+
],
23+
p=1.0,
24+
),
2125
]
2226
)
2327

2428
images_list = [image]
2529
image = np.array(image)
26-
for i in range(15):
30+
for i in tqdm(range(15)):
2731
augmentations = transform(image=image)
2832
augmented_img = augmentations["image"]
2933
images_list.append(augmented_img)
3034
plot_examples(images_list)
31-

ML/Pytorch/Basics/albumentations_tutorial/full_pytorch_example.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from torch.utils.data import Dataset
99
import os
1010

11+
1112
class ImageFolder(Dataset):
1213
def __init__(self, root_dir, transform=None):
1314
super(ImageFolder, self).__init__()
@@ -18,7 +19,7 @@ def __init__(self, root_dir, transform=None):
1819

1920
for index, name in enumerate(self.class_names):
2021
files = os.listdir(os.path.join(root_dir, name))
21-
self.data += list(zip(files, [index]*len(files)))
22+
self.data += list(zip(files, [index] * len(files)))
2223

2324
def __len__(self):
2425
return len(self.data)
@@ -43,10 +44,13 @@ def __getitem__(self, index):
4344
A.HorizontalFlip(p=0.5),
4445
A.VerticalFlip(p=0.1),
4546
A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.9),
46-
A.OneOf([
47-
A.Blur(blur_limit=3, p=0.5),
48-
A.ColorJitter(p=0.5),
49-
], p=1.0),
47+
A.OneOf(
48+
[
49+
A.Blur(blur_limit=3, p=0.5),
50+
A.ColorJitter(p=0.5),
51+
],
52+
p=1.0,
53+
),
5054
A.Normalize(
5155
mean=[0, 0, 0],
5256
std=[1, 1, 1],
@@ -58,5 +62,5 @@ def __getitem__(self, index):
5862

5963
dataset = ImageFolder(root_dir="cat_dogs", transform=transform)
6064

61-
for x,y in dataset:
65+
for x, y in dataset:
6266
print(x.shape)

ML/Pytorch/Basics/albumentations_tutorial/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
def visualize(image):
1010
plt.figure(figsize=(10, 10))
11-
plt.axis('off')
11+
plt.axis("off")
1212
plt.imshow(image)
1313
plt.show()
1414

@@ -22,7 +22,7 @@ def plot_examples(images, bboxes=None):
2222
if bboxes is not None:
2323
img = visualize_bbox(images[i - 1], bboxes[i - 1], class_name="Elon")
2424
else:
25-
img = images[i-1]
25+
img = images[i - 1]
2626
fig.add_subplot(rows, columns, i)
2727
plt.imshow(img)
2828
plt.show()
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/sh
2+
3+
wget https://www.kaggle.com/datasets/e1cd22253a9b23b073794872bf565648ddbe4f17e7fa9e74766ad3707141adeb/download?datasetVersionNumber=1

ML/Pytorch/Basics/custom_dataset_txt/loader_customtext.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
"""
2+
Introductory tutorial on how to deal with custom text datasets in PyTorch.
3+
Note that there are better ways to do this when dealing with huge text datasets.
4+
But this is a good way of understanding how it works and can be used as a starting
5+
point, particularly for smaller/medium datasets.
6+
7+
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
8+
* 2020-04-09 Initial coding
9+
* 2022-12-19 Updated comments, minor code revision, and checked code still works with latest PyTorch.
10+
"""
11+
12+
113
import os # when loading file paths
214
import pandas as pd # for lookup in annotation file
315
import spacy # for tokenizer
@@ -15,8 +27,8 @@
1527
# of same seq_len and setup dataloader)
1628
# Note that loading the image is very easy compared to the text!
1729

18-
# Download with: python -m spacy download en
19-
spacy_eng = spacy.load("en")
30+
# Download with: python -m spacy download en_core_web_sm
31+
spacy_eng = spacy.load("en_core_web_sm")
2032

2133

2234
class Vocabulary:
@@ -130,7 +142,10 @@ def get_loader(
130142

131143
if __name__ == "__main__":
132144
transform = transforms.Compose(
133-
[transforms.Resize((224, 224)), transforms.ToTensor(),]
145+
[
146+
transforms.Resize((224, 224)),
147+
transforms.ToTensor(),
148+
]
134149
)
135150

136151
loader, dataset = get_loader(

ML/Pytorch/Basics/pytorch_progress_bar.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
"""
2+
Example code showing how to set up a progress bar using tqdm, which is very efficient and nice looking.
3+
4+
Programmed by Aladdin Persson <aladdin.persson at hotmail dot com>
5+
* 2020-05-09 Initial coding
6+
* 2022-12-19 Updated with more detailed comments, and checked code works with latest PyTorch.
7+
8+
"""
9+
110
import torch
211
import torch.nn as nn
312
from tqdm import tqdm

ML/Pytorch/Basics/pytorch_simple_CNN.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,20 @@
88
Programmed by Aladdin Persson
99
* 2020-04-08: Initial coding
1010
* 2021-03-24: More detailed comments and small revision of the code
11+
* 2022-12-19: Small revision of code, checked that it works with latest PyTorch version
1112
1213
"""
1314

1415
# Imports
1516
import torch
16-
import torchvision # torch package for vision related things
1717
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
1818
import torchvision.datasets as datasets # Standard datasets
1919
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
2020
from torch import optim # For optimizers like SGD, Adam, etc.
2121
from torch import nn # All neural network modules
22-
from torch.utils.data import DataLoader # Gives easier dataset managment by creating mini batches etc.
22+
from torch.utils.data import (
23+
DataLoader,
24+
) # Gives easier dataset management by creating mini batches etc.
2325
from tqdm import tqdm # For nice progress bar!
2426

2527
# Simple CNN
@@ -29,17 +31,17 @@ def __init__(self, in_channels=1, num_classes=10):
2931
self.conv1 = nn.Conv2d(
3032
in_channels=in_channels,
3133
out_channels=8,
32-
kernel_size=(3, 3),
33-
stride=(1, 1),
34-
padding=(1, 1),
34+
kernel_size=3,
35+
stride=1,
36+
padding=1,
3537
)
36-
self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))
38+
self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
3739
self.conv2 = nn.Conv2d(
3840
in_channels=8,
3941
out_channels=16,
40-
kernel_size=(3, 3),
41-
stride=(1, 1),
42-
padding=(1, 1),
42+
kernel_size=3,
43+
stride=1,
44+
padding=1,
4345
)
4446
self.fc1 = nn.Linear(16 * 7 * 7, num_classes)
4547

@@ -59,13 +61,17 @@ def forward(self, x):
5961
# Hyperparameters
6062
in_channels = 1
6163
num_classes = 10
62-
learning_rate = 0.001
64+
learning_rate = 3e-4 # karpathy's constant
6365
batch_size = 64
6466
num_epochs = 3
6567

6668
# Load Data
67-
train_dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
68-
test_dataset = datasets.MNIST(root="dataset/", train=False, transform=transforms.ToTensor(), download=True)
69+
train_dataset = datasets.MNIST(
70+
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
71+
)
72+
test_dataset = datasets.MNIST(
73+
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
74+
)
6975
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
7076
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
7177

@@ -110,10 +116,9 @@ def check_accuracy(loader, model):
110116
num_correct += (predictions == y).sum()
111117
num_samples += predictions.size(0)
112118

113-
114119
model.train()
115-
return num_correct/num_samples
120+
return num_correct / num_samples
116121

117122

118123
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
119-
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
124+
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

ML/Pytorch/Basics/pytorch_simple_fullynet.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@
1414

1515
# Imports
1616
import torch
17-
import torchvision # torch package for vision related things
1817
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
1918
import torchvision.datasets as datasets # Standard datasets
2019
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
2120
from torch import optim # For optimizers like SGD, Adam, etc.
2221
from torch import nn # All neural network modules
23-
from torch.utils.data import DataLoader # Gives easier dataset managment by creating mini batches etc.
22+
from torch.utils.data import (
23+
DataLoader,
24+
) # Gives easier dataset management by creating mini batches etc.
2425
from tqdm import tqdm # For nice progress bar!
2526

2627
# Here we create our simple neural network. For more details here we are subclassing and
@@ -37,8 +38,6 @@ def __init__(self, input_size, num_classes):
3738
input_size: the size of the input, in this case 784 (28x28)
3839
num_classes: the number of classes we want to predict, in this case 10 (0-9)
3940
40-
Returns:
41-
None
4241
"""
4342
super(NN, self).__init__()
4443
# Our first linear layer takes input_size, in this case 784 nodes, to 50
@@ -76,8 +75,12 @@ def forward(self, x):
7675
num_epochs = 3
7776

7877
# Load Data
79-
train_dataset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
80-
test_dataset = datasets.MNIST(root="dataset/", train=False, transform=transforms.ToTensor(), download=True)
78+
train_dataset = datasets.MNIST(
79+
root="dataset/", train=True, transform=transforms.ToTensor(), download=True
80+
)
81+
test_dataset = datasets.MNIST(
82+
root="dataset/", train=False, transform=transforms.ToTensor(), download=True
83+
)
8184
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
8285
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)
8386

@@ -153,8 +156,9 @@ def check_accuracy(loader, model):
153156
num_samples += predictions.size(0)
154157

155158
model.train()
156-
return num_correct/num_samples
159+
return num_correct / num_samples
160+
157161

158162
# Check accuracy on training & test to see how good our model is
159163
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
160-
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
164+
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")

0 commit comments

Comments
 (0)