Skip to content

Commit 8bdbca7

Browse files
rtqichen authored and apaszke committed
add mnist
1 parent 13a0493 commit 8bdbca7

File tree

2 files changed

+157
-1
lines changed

2 files changed

+157
-1
lines changed

torchvision/datasets/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
from .folder import ImageFolder
33
from .coco import CocoCaptions, CocoDetection
44
from .cifar import CIFAR10, CIFAR100
5+
from .mnist import MNIST
56

67
__all__ = ('LSUN', 'LSUNClass',
78
'ImageFolder',
89
'CocoCaptions', 'CocoDetection',
9-
'CIFAR10', 'CIFAR100')
10+
'CIFAR10', 'CIFAR100',
11+
'MNIST')

torchvision/datasets/mnist.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
from __future__ import print_function
2+
import torch.utils.data as data
3+
from PIL import Image
4+
import os
5+
import os.path
6+
import errno
7+
import torch
8+
import json
9+
import codecs
10+
import numpy as np
11+
12+
class MNIST(data.Dataset):
    """MNIST dataset served from pre-processed torch files under ``root``.

    ``download()`` fetches the four raw IDX archives listed in ``urls`` into
    ``root/raw``, decodes them with ``read_image_file`` / ``read_label_file``
    and stores ``(images, labels)`` tuples in ``root/processed/training.pt``
    and ``root/processed/test.pt``.

    Args:
        root: Directory that holds (or will hold) the ``raw`` and
            ``processed`` sub-folders.
        train: If true, load ``training.pt``; otherwise load ``test.pt``.
        transform: Optional callable applied to the PIL image of each sample.
        target_transform: Optional callable applied to the label of each
            sample.
        download: If true, call ``download()`` before loading.

    Raises:
        RuntimeError: If the processed files are missing after the optional
            download step.
    """

    urls = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz',
    ]
    raw_folder = 'raw'
    processed_folder = 'processed'
    training_file = 'training.pt'
    test_file = 'test.pt'

    def __init__(self, root, train=True, transform=None, target_transform=None, download=False):
        self.root = root
        self.transform = transform
        self.target_transform = target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError('Dataset not found.'
                               + ' You can use download=True to download it')

        # Only the split that was asked for is loaded into memory.
        if self.train:
            self.train_data, self.train_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.training_file))
        else:
            self.test_data, self.test_labels = torch.load(
                os.path.join(self.root, self.processed_folder, self.test_file))

    def __getitem__(self, index):
        """Return ``(image, target)`` for ``index`` after applying transforms."""
        if self.train:
            img, target = self.train_data[index], self.train_labels[index]
        else:
            img, target = self.test_data[index], self.test_labels[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img.numpy(), mode='L')

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        """Return the number of samples in the loaded split."""
        # Report what was actually loaded rather than a hard-coded size, so
        # the length always agrees with what __getitem__ can index.
        if self.train:
            return len(self.train_data)
        return len(self.test_data)

    def _check_exists(self):
        """Return True when both processed files are present under ``root``."""
        processed_dir = os.path.join(self.root, self.processed_folder)
        return os.path.exists(os.path.join(processed_dir, self.training_file)) and \
            os.path.exists(os.path.join(processed_dir, self.test_file))

    def download(self):
        """Download the raw archives and write the processed torch files.

        Does nothing (apart from printing a notice) when the processed files
        already exist.
        """
        from six.moves import urllib
        import gzip

        if self._check_exists():
            print('Files already downloaded')
            return

        # Create each directory on its own: with a single try block an
        # already-existing raw folder would raise EEXIST and the processed
        # folder would never be created.
        for folder in (self.raw_folder, self.processed_folder):
            try:
                os.makedirs(os.path.join(self.root, folder))
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

        # download files
        for url in self.urls:
            print('Downloading ' + url)
            filename = url.rpartition('/')[2]
            file_path = os.path.join(self.root, self.raw_folder, filename)

            response = urllib.request.urlopen(url)
            try:
                with open(file_path, 'wb') as f:
                    f.write(response.read())
            finally:
                response.close()

            # Every entry in ``urls`` ends in ".gz"; drop only that trailing
            # extension so a ".gz" elsewhere in the path (e.g. in ``root``)
            # is left alone.
            unpacked_path = os.path.splitext(file_path)[0]
            with open(unpacked_path, 'wb') as out_f, \
                    gzip.GzipFile(file_path) as zip_f:
                out_f.write(zip_f.read())
            os.unlink(file_path)

        # process and save as torch files
        print('Processing')

        raw_dir = os.path.join(self.root, self.raw_folder)
        processed_dir = os.path.join(self.root, self.processed_folder)

        training_set = (
            read_image_file(os.path.join(raw_dir, 'train-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 'train-labels-idx1-ubyte'))
        )
        test_set = (
            read_image_file(os.path.join(raw_dir, 't10k-images-idx3-ubyte')),
            read_label_file(os.path.join(raw_dir, 't10k-labels-idx1-ubyte'))
        )
        with open(os.path.join(processed_dir, self.training_file), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(processed_dir, self.test_file), 'wb') as f:
            torch.save(test_set, f)

        print('Done!')
117+
118+
def get_int(b):
    """Interpret the byte string ``b`` as an unsigned big-endian integer."""
    # Hex-encoding keeps the byte order, so the most significant byte of
    # ``b`` becomes the leading hex digits of the number.
    hex_digits = codecs.encode(b, 'hex')
    return int(hex_digits, 16)
120+
121+
def parse_byte(b):
    """Return the integer value of one element taken from a byte string.

    A one-character ``str`` is converted with ``ord``; any other value is
    returned unchanged.
    """
    return ord(b) if isinstance(b, str) else b
125+
126+
def read_label_file(path):
    """Read an IDX label file and return its labels as a 1-D LongTensor.

    The file starts with two big-endian 32-bit integers: the magic number
    2049 and the number of labels. Every remaining byte is one label.

    Args:
        path: Path of the uncompressed label file.

    Returns:
        An int64 tensor holding one entry per label.

    Raises:
        AssertionError: If the magic number is wrong or the number of label
            bytes differs from the count in the header.
    """
    import struct

    with open(path, 'rb') as f:
        data = f.read()

    magic, length = struct.unpack('>II', data[:8])
    assert magic == 2049

    # Decode the whole payload in one vectorised step instead of converting
    # each byte in a Python loop.
    labels = np.frombuffer(data, dtype=np.uint8, offset=8)
    assert len(labels) == length

    # astype() yields a writable int64 copy, which from_numpy turns into the
    # same tensor type the LongTensor constructor produced before.
    return torch.from_numpy(labels.astype(np.int64))
134+
135+
def read_image_file(path):
    """Read an IDX image file and return its images as a ByteTensor.

    The file starts with four big-endian 32-bit integers: the magic number
    2051, the number of images, the number of rows and the number of
    columns. The pixels follow, one unsigned byte each, image by image and
    row by row.

    Args:
        path: Path of the uncompressed image file.

    Returns:
        A uint8 tensor of shape ``(num_images, num_rows, num_cols)``.

    Raises:
        AssertionError: If the magic number is wrong.
        ValueError: If the file holds fewer pixel bytes than the header
            announces.
    """
    import struct

    with open(path, 'rb') as f:
        data = f.read()

    magic, length, num_rows, num_cols = struct.unpack('>IIII', data[:16])
    assert magic == 2051

    # Read exactly the number of pixels the header announces in a single
    # vectorised step; trailing bytes, if any, are ignored.
    pixels = np.frombuffer(data, dtype=np.uint8,
                           count=length * num_rows * num_cols, offset=16)

    # frombuffer returns a read-only view of ``data``; copy so the tensor
    # owns writable memory. Shape comes from the header rather than a
    # hard-coded 28x28.
    return torch.from_numpy(pixels.copy()).view(length, num_rows, num_cols)

0 commit comments

Comments
 (0)