
Commit 2db438b

Automated tutorials push
1 parent 26ae677 commit 2db438b

192 files changed: +14592, -14124 lines


_downloads/13b143c2380f4768d9432d808ad50799/char_rnn_classification_tutorial.ipynb

Lines changed: 32 additions & 27 deletions
@@ -97,7 +97,7 @@
 },
 "outputs": [],
 "source": [
-"import torch \n",
+"import torch\n",
 "\n",
 "# Check if CUDA is available\n",
 "device = torch.device('cpu')\n",
@@ -138,13 +138,14 @@
 },
 "outputs": [],
 "source": [
-"import string \n",
+"import string\n",
 "import unicodedata\n",
 "\n",
-"allowed_characters = string.ascii_letters + \" .,;'\"\n",
-"n_letters = len(allowed_characters) \n",
+"# We can use \"_\" to represent an out-of-vocabulary character, that is, any character we are not handling in our model\n",
+"allowed_characters = string.ascii_letters + \" .,;'\" + \"_\"\n",
+"n_letters = len(allowed_characters)\n",
 "\n",
-"# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427 \n",
+"# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427\n",
 "def unicodeToAscii(s):\n",
 "    return ''.join(\n",
 "        c for c in unicodedata.normalize('NFD', s)\n",
@@ -203,7 +204,11 @@
 "source": [
 "# Find letter index from all_letters, e.g. \"a\" = 0\n",
 "def letterToIndex(letter):\n",
-"    return allowed_characters.find(letter)\n",
+"    # return our out-of-vocabulary character if we encounter a letter unknown to our model\n",
+"    if letter not in allowed_characters:\n",
+"        return allowed_characters.find(\"_\")\n",
+"    else:\n",
+"        return allowed_characters.find(letter)\n",
 "\n",
 "# Turn a line into a <line_length x 1 x n_letters>,\n",
 "# or an array of one-hot letter vectors\n",
@@ -261,7 +266,7 @@
 "from io import open\n",
 "import glob\n",
 "import os\n",
-"import time \n",
+"import time\n",
 "\n",
 "import torch\n",
 "from torch.utils.data import Dataset\n",
@@ -270,26 +275,26 @@
 "\n",
 "    def __init__(self, data_dir):\n",
 "        self.data_dir = data_dir #for provenance of the dataset\n",
-"        self.load_time = time.localtime #for provenance of the dataset \n",
+"        self.load_time = time.localtime #for provenance of the dataset\n",
 "        labels_set = set() #set of all classes\n",
 "\n",
 "        self.data = []\n",
 "        self.data_tensors = []\n",
-"        self.labels = [] \n",
-"        self.labels_tensors = [] \n",
+"        self.labels = []\n",
+"        self.labels_tensors = []\n",
 "\n",
 "        #read all the ``.txt`` files in the specified directory\n",
-"        text_files = glob.glob(os.path.join(data_dir, '*.txt')) \n",
+"        text_files = glob.glob(os.path.join(data_dir, '*.txt'))\n",
 "        for filename in text_files:\n",
 "            label = os.path.splitext(os.path.basename(filename))[0]\n",
 "            labels_set.add(label)\n",
 "            lines = open(filename, encoding='utf-8').read().strip().split('\\n')\n",
-"            for name in lines: \n",
+"            for name in lines:\n",
 "                self.data.append(name)\n",
 "                self.data_tensors.append(lineToTensor(name))\n",
 "                self.labels.append(label)\n",
 "\n",
-"        #Cache the tensor representation of the labels \n",
+"        #Cache the tensor representation of the labels\n",
 "        self.labels_uniq = list(labels_set)\n",
 "        for idx in range(len(self.labels)):\n",
 "            temp_tensor = torch.tensor([self.labels_uniq.index(self.labels[idx])], dtype=torch.long)\n",
@@ -302,7 +307,7 @@
 "        data_item = self.data[idx]\n",
 "        data_label = self.labels[idx]\n",
 "        data_tensor = self.data_tensors[idx]\n",
-"        label_tensor = self.labels_tensors[idx] \n",
+"        label_tensor = self.labels_tensors[idx]\n",
 "\n",
 "        return label_tensor, data_tensor, data_label, data_item"
 ]
@@ -402,7 +407,7 @@
 "        self.rnn = nn.RNN(input_size, hidden_size)\n",
 "        self.h2o = nn.Linear(hidden_size, output_size)\n",
 "        self.softmax = nn.LogSoftmax(dim=1)\n",
-"        \n",
+"\n",
 "    def forward(self, line_tensor):\n",
 "        rnn_out, hidden = self.rnn(line_tensor)\n",
 "        output = self.h2o(hidden[0])\n",
@@ -415,7 +420,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We can then create an RNN with 57 input nodes, 128 hidden nodes, and 18\n",
+"We can then create an RNN with 58 input nodes, 128 hidden nodes, and 18\n",
 "outputs:\n"
 ]
 },
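Why 57 becomes 58: string.ascii_letters contributes 52 characters and " .,;'" another five, so the old vocabulary had 57 entries; appending the "_" placeholder brings n_letters to 58. A quick check:

    import string

    print(len(string.ascii_letters))                  # 52
    print(len(string.ascii_letters + " .,;'"))        # 57, the old n_letters
    print(len(string.ascii_letters + " .,;'" + "_"))  # 58, the new n_letters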
@@ -456,7 +461,7 @@
 "\n",
 "input = lineToTensor('Albert')\n",
 "output = rnn(input) #this is equivalent to ``output = rnn.forward(input)``\n",
-"print(output) \n",
+"print(output)\n",
 "print(label_from_output(output, alldata.labels_uniq))"
 ]
 },
@@ -494,8 +499,8 @@
 },
 "outputs": [],
 "source": [
-"import random \n",
-"import numpy as np \n",
+"import random\n",
+"import numpy as np\n",
 "\n",
 "def train(rnn, training_data, n_epoch = 10, n_batch_size = 64, report_every = 50, learning_rate = 0.2, criterion = nn.NLLLoss()):\n",
 "    \"\"\"\n",
@@ -504,22 +509,22 @@
 "    # Keep track of losses for plotting\n",
 "    current_loss = 0\n",
 "    all_losses = []\n",
-"    rnn.train() \n",
+"    rnn.train()\n",
 "    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)\n",
 "\n",
 "    start = time.time()\n",
 "    print(f\"training on data set with n = {len(training_data)}\")\n",
 "\n",
-"    for iter in range(1, n_epoch + 1): \n",
-"        rnn.zero_grad() # clear the gradients \n",
+"    for iter in range(1, n_epoch + 1):\n",
+"        rnn.zero_grad() # clear the gradients\n",
 "\n",
 "        # create some minibatches\n",
 "        # we cannot use dataloaders because each of our names is a different length\n",
 "        batches = list(range(len(training_data)))\n",
 "        random.shuffle(batches)\n",
 "        batches = np.array_split(batches, len(batches) //n_batch_size )\n",
 "\n",
-"        for idx, batch in enumerate(batches): \n",
+"        for idx, batch in enumerate(batches):\n",
 "            batch_loss = 0\n",
 "            for i in batch: #for each example in this batch\n",
 "                (label_tensor, text_tensor, label, text) = training_data[i]\n",
@@ -534,12 +539,12 @@
 "            optimizer.zero_grad()\n",
 "\n",
 "            current_loss += batch_loss.item() / len(batch)\n",
-"        \n",
+"\n",
 "        all_losses.append(current_loss / len(batches) )\n",
 "        if iter % report_every == 0:\n",
 "            print(f\"{iter} ({iter / n_epoch:.0%}): \\t average batch loss = {all_losses[-1]}\")\n",
 "            current_loss = 0\n",
-"    \n",
+"\n",
 "    return all_losses"
 ]
 },
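For context, a hypothetical call of the train function defined above; the data path, the class name CharRNN, and the hyperparameter values are assumptions for illustration, not part of this commit:

    # Hypothetical usage sketch (assumes NamesDataset, the notebook's RNN class,
    # and n_letters from the surrounding cells; values are illustrative).
    alldata = NamesDataset("data/names")                     # path is an assumption
    rnn = CharRNN(n_letters, 128, len(alldata.labels_uniq))  # class name assumed
    all_losses = train(rnn, alldata, n_epoch=10, learning_rate=0.2)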
@@ -617,12 +622,12 @@
 "def evaluate(rnn, testing_data, classes):\n",
 "    confusion = torch.zeros(len(classes), len(classes))\n",
-"    \n",
+"\n",
 "    rnn.eval() #set to eval mode\n",
 "    with torch.no_grad(): # do not record the gradients during eval phase\n",
 "        for i in range(len(testing_data)):\n",
 "            (label_tensor, text_tensor, label, text) = testing_data[i]\n",
-"            output = rnn(text_tensor) \n",
+"            output = rnn(text_tensor)\n",
 "            guess, guess_i = label_from_output(output, classes)\n",
 "            label_i = classes.index(label)\n",
 "            confusion[label_i][guess_i] += 1\n",
