PyTorchKorea
diff --git a/‎docs/_downloads/462f53ac0f7c6840743ad8655c43102c/torchtext_translation.py‎
Lines changed: 5 additions & 5 deletions b/‎docs/_downloads/462f53ac0f7c6840743ad8655c43102c/torchtext_translation.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎docs/_downloads/6fbbb25a2ddfe5bf93b618f53cf7077e/torchtext_translation.ipynb‎
Lines changed: 2 additions & 2 deletions b/‎docs/_downloads/6fbbb25a2ddfe5bf93b618f53cf7077e/torchtext_translation.ipynb‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs/_downloads/d9398fce39ca80dc4bb8b8ea55b575a8/nn_tutorial.ipynb‎
Lines changed: 1 addition & 1 deletion b/‎docs/_downloads/d9398fce39ca80dc4bb8b8ea55b575a8/nn_tutorial.ipynb‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/_downloads/f16255c783f9e487235b8eff6c8792b9/nn_tutorial.py‎
Lines changed: 1 addition & 1 deletion b/‎docs/_downloads/f16255c783f9e487235b8eff6c8792b9/nn_tutorial.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/_images/sphx_glr_char_rnn_classification_tutorial_001.png‎
-277 Bytes b/‎docs/_images/sphx_glr_char_rnn_classification_tutorial_001.png‎
-277 Bytes
diff --git a/‎docs/_images/sphx_glr_char_rnn_classification_tutorial_002.png‎
-135 Bytes b/‎docs/_images/sphx_glr_char_rnn_classification_tutorial_002.png‎
-135 Bytes
diff --git a/‎docs/_images/sphx_glr_char_rnn_generation_tutorial_001.png‎
-15 Bytes b/‎docs/_images/sphx_glr_char_rnn_generation_tutorial_001.png‎
-15 Bytes
diff --git a/‎docs/_images/sphx_glr_cifar10_tutorial_001.png‎
-699 Bytes b/‎docs/_images/sphx_glr_cifar10_tutorial_001.png‎
-699 Bytes
diff --git a/‎docs/_images/sphx_glr_cifar10_tutorial_thumb.png‎
-2.75 KB b/‎docs/_images/sphx_glr_cifar10_tutorial_thumb.png‎
-2.75 KB
diff --git a/‎docs/_images/sphx_glr_data_loading_tutorial_003.png‎
-350 Bytes b/‎docs/_images/sphx_glr_data_loading_tutorial_003.png‎
-350 Bytes
@@ -36,14 +36,14 @@
 #
 # ::
 #
-#    python -m spacy download en
-#    python -m spacy download de
+#    python -m spacy download en_core_web_sm
+#    python -m spacy download de_core_news_sm
 
 import torchtext
 import torch
 from torchtext.data.utils import get_tokenizer
 from collections import Counter
-from torchtext.vocab import Vocab, vocab
+from torchtext.vocab import vocab
 from torchtext.utils import download_from_url, extract_archive
 import io
 
@@ -56,8 +56,8 @@
 val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
 test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]
 
-de_tokenizer = get_tokenizer('spacy', language='de')
-en_tokenizer = get_tokenizer('spacy', language='en')
+de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
+en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
 
 def build_vocab(filepath, tokenizer):
   counter = Counter()
 
@@ -22,7 +22,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## \ub370\uc774\ud130 \ucc98\ub9ac\ud558\uae30\n\n``torchtext`` \uc5d0\ub294 \uc5b8\uc5b4 \ubcc0\ud658 \ubaa8\ub378\uc744 \ub9cc\ub4e4 \ub54c \uc27d\uac8c \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub370\uc774\ud130\uc14b\uc744 \ub9cc\ub4e4\uae30 \uc801\ud569\ud55c \ub2e4\uc591\ud55c \ub3c4\uad6c\uac00 \uc788\uc2b5\ub2c8\ub2e4.\n\uc774 \uc608\uc81c\uc5d0\uc11c\ub294 \uac00\uacf5\ub418\uc9c0 \uc54a\uc740 \ud14d\uc2a4\ud2b8 \ubb38\uc7a5(raw text sentence)\uc744 \ud1a0\ud070\ud654(tokenize)\ud558\uace0, \uc5b4\ud718\uc9d1(vocabulary)\uc744 \ub9cc\ub4e4\uace0,\n\ud1a0\ud070\uc744 \ud150\uc11c\ub85c \uc22b\uc790\ud654(numericalize)\ud558\ub294 \ubc29\ubc95\uc744 \uc54c\uc544\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\ucc38\uace0 : \uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\uc758 \ud1a0\ud070\ud654(tokenization)\uc5d0\ub294 [Spacy](https://spacy.io)_ \uac00 \ud544\uc694\ud569\ub2c8\ub2e4.\nSpacy\ub294 \uc601\uc5b4 \uc774 \uc678\uc758 \ub2e4\ub978 \uc5b8\uc5b4\uc5d0 \ub300\ud55c \uac15\ub825\ud55c \ud1a0\ud070\ud654 \uae30\ub2a5\uc744 \uc81c\uacf5\ud558\uae30 \ub54c\ubb38\uc5d0 \uc0ac\uc6a9\ud569\ub2c8\ub2e4. ``torchtext`` \ub294\n`basic_english`` \ud1a0\ud06c\ub098\uc774\uc800\ub97c \uc81c\uacf5\ud560 \ubfd0 \uc544\ub2c8\ub77c \uc601\uc5b4\uc5d0 \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub2e4\ub978 \ud1a0\ud06c\ub098\uc774\uc800\ub4e4(\uc608\ucee8\ub370\n[Moses](https://bitbucket.org/luismsgomes/mosestokenizer/src/default/)_ )\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4\ub9cc, \uc5b8\uc5b4 \ubc88\uc5ed\uc744 \uc704\ud574\uc11c\ub294 \ub2e4\uc591\ud55c \uc5b8\uc5b4\ub97c\n\ub2e4\ub8e8\uc5b4\uc57c \ud558\uae30 \ub54c\ubb38\uc5d0 Spacy\uac00 \uac00\uc7a5 \uc801\ud569\ud569\ub2c8\ub2e4.\n\n\uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc744 \uc2e4\ud589\ud558\ub824\uba74, \uc6b0\uc120 ``pip`` \ub098 ``conda`` \ub85c ``spacy`` \ub97c \uc124\uce58\ud558\uc138\uc694. \uadf8 \ub2e4\uc74c,\nSpacy \ud1a0\ud06c\ub098\uc774\uc800\uac00 \uc4f8 \uc601\uc5b4\uc640 \ub3c5\uc77c\uc5b4\uc5d0 \ub300\ud55c \ub370\uc774\ud130\ub97c \ub2e4\uc6b4\ub85c\ub4dc \ubc1b\uc2b5\ub2c8\ub2e4.\n\n::\n\n   python -m spacy download en\n   python -m spacy download de\n\n"
+        "## \ub370\uc774\ud130 \ucc98\ub9ac\ud558\uae30\n\n``torchtext`` \uc5d0\ub294 \uc5b8\uc5b4 \ubcc0\ud658 \ubaa8\ub378\uc744 \ub9cc\ub4e4 \ub54c \uc27d\uac8c \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub370\uc774\ud130\uc14b\uc744 \ub9cc\ub4e4\uae30 \uc801\ud569\ud55c \ub2e4\uc591\ud55c \ub3c4\uad6c\uac00 \uc788\uc2b5\ub2c8\ub2e4.\n\uc774 \uc608\uc81c\uc5d0\uc11c\ub294 \uac00\uacf5\ub418\uc9c0 \uc54a\uc740 \ud14d\uc2a4\ud2b8 \ubb38\uc7a5(raw text sentence)\uc744 \ud1a0\ud070\ud654(tokenize)\ud558\uace0, \uc5b4\ud718\uc9d1(vocabulary)\uc744 \ub9cc\ub4e4\uace0,\n\ud1a0\ud070\uc744 \ud150\uc11c\ub85c \uc22b\uc790\ud654(numericalize)\ud558\ub294 \ubc29\ubc95\uc744 \uc54c\uc544\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\ucc38\uace0 : \uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\uc758 \ud1a0\ud070\ud654(tokenization)\uc5d0\ub294 [Spacy](https://spacy.io)_ \uac00 \ud544\uc694\ud569\ub2c8\ub2e4.\nSpacy\ub294 \uc601\uc5b4 \uc774 \uc678\uc758 \ub2e4\ub978 \uc5b8\uc5b4\uc5d0 \ub300\ud55c \uac15\ub825\ud55c \ud1a0\ud070\ud654 \uae30\ub2a5\uc744 \uc81c\uacf5\ud558\uae30 \ub54c\ubb38\uc5d0 \uc0ac\uc6a9\ud569\ub2c8\ub2e4. ``torchtext`` \ub294\n`basic_english`` \ud1a0\ud06c\ub098\uc774\uc800\ub97c \uc81c\uacf5\ud560 \ubfd0 \uc544\ub2c8\ub77c \uc601\uc5b4\uc5d0 \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub2e4\ub978 \ud1a0\ud06c\ub098\uc774\uc800\ub4e4(\uc608\ucee8\ub370\n[Moses](https://bitbucket.org/luismsgomes/mosestokenizer/src/default/)_ )\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4\ub9cc, \uc5b8\uc5b4 \ubc88\uc5ed\uc744 \uc704\ud574\uc11c\ub294 \ub2e4\uc591\ud55c \uc5b8\uc5b4\ub97c\n\ub2e4\ub8e8\uc5b4\uc57c \ud558\uae30 \ub54c\ubb38\uc5d0 Spacy\uac00 \uac00\uc7a5 \uc801\ud569\ud569\ub2c8\ub2e4.\n\n\uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc744 \uc2e4\ud589\ud558\ub824\uba74, \uc6b0\uc120 ``pip`` \ub098 ``conda`` \ub85c ``spacy`` \ub97c \uc124\uce58\ud558\uc138\uc694. \uadf8 \ub2e4\uc74c,\nSpacy \ud1a0\ud06c\ub098\uc774\uc800\uac00 \uc4f8 \uc601\uc5b4\uc640 \ub3c5\uc77c\uc5b4\uc5d0 \ub300\ud55c \ub370\uc774\ud130\ub97c \ub2e4\uc6b4\ub85c\ub4dc \ubc1b\uc2b5\ub2c8\ub2e4.\n\n::\n\n   python -m spacy download en_core_web_sm\n   python -m spacy download de_core_news_sm\n\n"
       ]
     },
     {
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import torchtext\nimport torch\nfrom torchtext.data.utils import get_tokenizer\nfrom collections import Counter\nfrom torchtext.vocab import Vocab, vocab\nfrom torchtext.utils import download_from_url, extract_archive\nimport io\n\nurl_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'\ntrain_urls = ('train.de.gz', 'train.en.gz')\nval_urls = ('val.de.gz', 'val.en.gz')\ntest_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')\n\ntrain_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]\nval_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]\ntest_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]\n\nde_tokenizer = get_tokenizer('spacy', language='de')\nen_tokenizer = get_tokenizer('spacy', language='en')\n\ndef build_vocab(filepath, tokenizer):\n  counter = Counter()\n  with io.open(filepath, encoding=\"utf8\") as f:\n    for string_ in f:\n      counter.update(tokenizer(string_))\n  return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])\n\nde_vocab = build_vocab(train_filepaths[0], de_tokenizer)\nen_vocab = build_vocab(train_filepaths[1], en_tokenizer)\n\ndef data_process(filepaths):\n  raw_de_iter = iter(io.open(filepaths[0], encoding=\"utf8\"))\n  raw_en_iter = iter(io.open(filepaths[1], encoding=\"utf8\"))\n  data = []\n  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):\n    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],\n                            dtype=torch.long)\n    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],\n                            dtype=torch.long)\n    data.append((de_tensor_, en_tensor_))\n  return data\n\ntrain_data = data_process(train_filepaths)\nval_data = data_process(val_filepaths)\ntest_data = data_process(test_filepaths)"
+        "import torchtext\nimport torch\nfrom torchtext.data.utils import get_tokenizer\nfrom collections import Counter\nfrom torchtext.vocab import vocab\nfrom torchtext.utils import download_from_url, extract_archive\nimport io\n\nurl_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'\ntrain_urls = ('train.de.gz', 'train.en.gz')\nval_urls = ('val.de.gz', 'val.en.gz')\ntest_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')\n\ntrain_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]\nval_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]\ntest_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]\n\nde_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')\nen_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')\n\ndef build_vocab(filepath, tokenizer):\n  counter = Counter()\n  with io.open(filepath, encoding=\"utf8\") as f:\n    for string_ in f:\n      counter.update(tokenizer(string_))\n  return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])\n\nde_vocab = build_vocab(train_filepaths[0], de_tokenizer)\nen_vocab = build_vocab(train_filepaths[1], en_tokenizer)\n\ndef data_process(filepaths):\n  raw_de_iter = iter(io.open(filepaths[0], encoding=\"utf8\"))\n  raw_en_iter = iter(io.open(filepaths[1], encoding=\"utf8\"))\n  data = []\n  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):\n    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],\n                            dtype=torch.long)\n    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],\n                            dtype=torch.long)\n    data.append((de_tensor_, en_tensor_))\n  return data\n\ntrain_data = data_process(train_filepaths)\nval_data = data_process(val_filepaths)\ntest_data = data_process(test_filepaths)"
       ]
     },
     {
 
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "from pathlib import Path\nimport requests\n\nDATA_PATH = Path(\"data\")\nPATH = DATA_PATH / \"mnist\"\n\nPATH.mkdir(parents=True, exist_ok=True)\n\nURL = \"https://github.com/pytorch/tutorials/raw/master/_static/\"\nFILENAME = \"mnist.pkl.gz\"\n\nif not (PATH / FILENAME).exists():\n        content = requests.get(URL + FILENAME).content\n        (PATH / FILENAME).open(\"wb\").write(content)"
+        "from pathlib import Path\nimport requests\n\nDATA_PATH = Path(\"data\")\nPATH = DATA_PATH / \"mnist\"\n\nPATH.mkdir(parents=True, exist_ok=True)\n\nURL = \"https://github.com/pytorch/tutorials/raw/main/_static/\"\nFILENAME = \"mnist.pkl.gz\"\n\nif not (PATH / FILENAME).exists():\n        content = requests.get(URL + FILENAME).content\n        (PATH / FILENAME).open(\"wb\").write(content)"
       ]
     },
     {
 
@@ -53,7 +53,7 @@
 
 PATH.mkdir(parents=True, exist_ok=True)
 
-URL = "https://github.com/pytorch/tutorials/raw/master/_static/"
+URL = "https://github.com/pytorch/tutorials/raw/main/_static/"
 FILENAME = "mnist.pkl.gz"
 
 if not (PATH / FILENAME).exists():
Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`"cell_type": "markdown",`
`23`	`23`	`"metadata": {},`
`24`	`24`	`"source": [`
`25`		- "## \ub370\uc774\ud130 \ucc98\ub9ac\ud558\uae30\n\n``torchtext`` \uc5d0\ub294 \uc5b8\uc5b4 \ubcc0\ud658 \ubaa8\ub378\uc744 \ub9cc\ub4e4 \ub54c \uc27d\uac8c \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub370\uc774\ud130\uc14b\uc744 \ub9cc\ub4e4\uae30 \uc801\ud569\ud55c \ub2e4\uc591\ud55c \ub3c4\uad6c\uac00 \uc788\uc2b5\ub2c8\ub2e4.\n\uc774 \uc608\uc81c\uc5d0\uc11c\ub294 \uac00\uacf5\ub418\uc9c0 \uc54a\uc740 \ud14d\uc2a4\ud2b8 \ubb38\uc7a5(raw text sentence)\uc744 \ud1a0\ud070\ud654(tokenize)\ud558\uace0, \uc5b4\ud718\uc9d1(vocabulary)\uc744 \ub9cc\ub4e4\uace0,\n\ud1a0\ud070\uc744 \ud150\uc11c\ub85c \uc22b\uc790\ud654(numericalize)\ud558\ub294 \ubc29\ubc95\uc744 \uc54c\uc544\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\ucc38\uace0 : \uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\uc758 \ud1a0\ud070\ud654(tokenization)\uc5d0\ub294 [Spacy](https://spacy.io)_ \uac00 \ud544\uc694\ud569\ub2c8\ub2e4.\nSpacy\ub294 \uc601\uc5b4 \uc774 \uc678\uc758 \ub2e4\ub978 \uc5b8\uc5b4\uc5d0 \ub300\ud55c \uac15\ub825\ud55c \ud1a0\ud070\ud654 \uae30\ub2a5\uc744 \uc81c\uacf5\ud558\uae30 \ub54c\ubb38\uc5d0 \uc0ac\uc6a9\ud569\ub2c8\ub2e4. ``torchtext`` \ub294\n`basic_english`` \ud1a0\ud06c\ub098\uc774\uc800\ub97c \uc81c\uacf5\ud560 \ubfd0 \uc544\ub2c8\ub77c \uc601\uc5b4\uc5d0 \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub2e4\ub978 \ud1a0\ud06c\ub098\uc774\uc800\ub4e4(\uc608\ucee8\ub370\n[Moses](https://bitbucket.org/luismsgomes/mosestokenizer/src/default/)_ )\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4\ub9cc, \uc5b8\uc5b4 \ubc88\uc5ed\uc744 \uc704\ud574\uc11c\ub294 \ub2e4\uc591\ud55c \uc5b8\uc5b4\ub97c\n\ub2e4\ub8e8\uc5b4\uc57c \ud558\uae30 \ub54c\ubb38\uc5d0 Spacy\uac00 \uac00\uc7a5 \uc801\ud569\ud569\ub2c8\ub2e4.\n\n\uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc744 \uc2e4\ud589\ud558\ub824\uba74, \uc6b0\uc120 ``pip`` \ub098 ``conda`` \ub85c ``spacy`` \ub97c \uc124\uce58\ud558\uc138\uc694. \uadf8 \ub2e4\uc74c,\nSpacy \ud1a0\ud06c\ub098\uc774\uc800\uac00 \uc4f8 \uc601\uc5b4\uc640 \ub3c5\uc77c\uc5b4\uc5d0 \ub300\ud55c \ub370\uc774\ud130\ub97c \ub2e4\uc6b4\ub85c\ub4dc \ubc1b\uc2b5\ub2c8\ub2e4.\n\n::\n\n python -m spacy download en\n python -m spacy download de\n\n"
	`25`	+ "## \ub370\uc774\ud130 \ucc98\ub9ac\ud558\uae30\n\n``torchtext`` \uc5d0\ub294 \uc5b8\uc5b4 \ubcc0\ud658 \ubaa8\ub378\uc744 \ub9cc\ub4e4 \ub54c \uc27d\uac8c \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub370\uc774\ud130\uc14b\uc744 \ub9cc\ub4e4\uae30 \uc801\ud569\ud55c \ub2e4\uc591\ud55c \ub3c4\uad6c\uac00 \uc788\uc2b5\ub2c8\ub2e4.\n\uc774 \uc608\uc81c\uc5d0\uc11c\ub294 \uac00\uacf5\ub418\uc9c0 \uc54a\uc740 \ud14d\uc2a4\ud2b8 \ubb38\uc7a5(raw text sentence)\uc744 \ud1a0\ud070\ud654(tokenize)\ud558\uace0, \uc5b4\ud718\uc9d1(vocabulary)\uc744 \ub9cc\ub4e4\uace0,\n\ud1a0\ud070\uc744 \ud150\uc11c\ub85c \uc22b\uc790\ud654(numericalize)\ud558\ub294 \ubc29\ubc95\uc744 \uc54c\uc544\ubcf4\uaca0\uc2b5\ub2c8\ub2e4.\n\n\ucc38\uace0 : \uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc5d0\uc11c\uc758 \ud1a0\ud070\ud654(tokenization)\uc5d0\ub294 [Spacy](https://spacy.io)_ \uac00 \ud544\uc694\ud569\ub2c8\ub2e4.\nSpacy\ub294 \uc601\uc5b4 \uc774 \uc678\uc758 \ub2e4\ub978 \uc5b8\uc5b4\uc5d0 \ub300\ud55c \uac15\ub825\ud55c \ud1a0\ud070\ud654 \uae30\ub2a5\uc744 \uc81c\uacf5\ud558\uae30 \ub54c\ubb38\uc5d0 \uc0ac\uc6a9\ud569\ub2c8\ub2e4. ``torchtext`` \ub294\n`basic_english`` \ud1a0\ud06c\ub098\uc774\uc800\ub97c \uc81c\uacf5\ud560 \ubfd0 \uc544\ub2c8\ub77c \uc601\uc5b4\uc5d0 \uc0ac\uc6a9\ud560 \uc218 \uc788\ub294 \ub2e4\ub978 \ud1a0\ud06c\ub098\uc774\uc800\ub4e4(\uc608\ucee8\ub370\n[Moses](https://bitbucket.org/luismsgomes/mosestokenizer/src/default/)_ )\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4\ub9cc, \uc5b8\uc5b4 \ubc88\uc5ed\uc744 \uc704\ud574\uc11c\ub294 \ub2e4\uc591\ud55c \uc5b8\uc5b4\ub97c\n\ub2e4\ub8e8\uc5b4\uc57c \ud558\uae30 \ub54c\ubb38\uc5d0 Spacy\uac00 \uac00\uc7a5 \uc801\ud569\ud569\ub2c8\ub2e4.\n\n\uc774 \ud29c\ud1a0\ub9ac\uc5bc\uc744 \uc2e4\ud589\ud558\ub824\uba74, \uc6b0\uc120 ``pip`` \ub098 ``conda`` \ub85c ``spacy`` \ub97c \uc124\uce58\ud558\uc138\uc694. \uadf8 \ub2e4\uc74c,\nSpacy \ud1a0\ud06c\ub098\uc774\uc800\uac00 \uc4f8 \uc601\uc5b4\uc640 \ub3c5\uc77c\uc5b4\uc5d0 \ub300\ud55c \ub370\uc774\ud130\ub97c \ub2e4\uc6b4\ub85c\ub4dc \ubc1b\uc2b5\ub2c8\ub2e4.\n\n::\n\n python -m spacy download en_core_web_sm\n python -m spacy download de_core_news_sm\n\n"
`26`	`26`	`]`
`27`	`27`	`},`
`28`	`28`	`{`
`@@ -33,7 +33,7 @@`
`33`	`33`	`},`
`34`	`34`	`"outputs": [],`
`35`	`35`	`"source": [`
`36`		- "import torchtext\nimport torch\nfrom torchtext.data.utils import get_tokenizer\nfrom collections import Counter\nfrom torchtext.vocab import Vocab, vocab\nfrom torchtext.utils import download_from_url, extract_archive\nimport io\n\nurl_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'\ntrain_urls = ('train.de.gz', 'train.en.gz')\nval_urls = ('val.de.gz', 'val.en.gz')\ntest_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')\n\ntrain_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]\nval_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]\ntest_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]\n\nde_tokenizer = get_tokenizer('spacy', language='de')\nen_tokenizer = get_tokenizer('spacy', language='en')\n\ndef build_vocab(filepath, tokenizer):\n counter = Counter()\n with io.open(filepath, encoding=\"utf8\") as f:\n for string_ in f:\n counter.update(tokenizer(string_))\n return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])\n\nde_vocab = build_vocab(train_filepaths[0], de_tokenizer)\nen_vocab = build_vocab(train_filepaths[1], en_tokenizer)\n\ndef data_process(filepaths):\n raw_de_iter = iter(io.open(filepaths[0], encoding=\"utf8\"))\n raw_en_iter = iter(io.open(filepaths[1], encoding=\"utf8\"))\n data = []\n for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):\n de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],\n dtype=torch.long)\n en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],\n dtype=torch.long)\n data.append((de_tensor_, en_tensor_))\n return data\n\ntrain_data = data_process(train_filepaths)\nval_data = data_process(val_filepaths)\ntest_data = data_process(test_filepaths)"
	`36`	+ "import torchtext\nimport torch\nfrom torchtext.data.utils import get_tokenizer\nfrom collections import Counter\nfrom torchtext.vocab import vocab\nfrom torchtext.utils import download_from_url, extract_archive\nimport io\n\nurl_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'\ntrain_urls = ('train.de.gz', 'train.en.gz')\nval_urls = ('val.de.gz', 'val.en.gz')\ntest_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')\n\ntrain_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]\nval_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]\ntest_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]\n\nde_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')\nen_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')\n\ndef build_vocab(filepath, tokenizer):\n counter = Counter()\n with io.open(filepath, encoding=\"utf8\") as f:\n for string_ in f:\n counter.update(tokenizer(string_))\n return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])\n\nde_vocab = build_vocab(train_filepaths[0], de_tokenizer)\nen_vocab = build_vocab(train_filepaths[1], en_tokenizer)\n\ndef data_process(filepaths):\n raw_de_iter = iter(io.open(filepaths[0], encoding=\"utf8\"))\n raw_en_iter = iter(io.open(filepaths[1], encoding=\"utf8\"))\n data = []\n for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):\n de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],\n dtype=torch.long)\n en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],\n dtype=torch.long)\n data.append((de_tensor_, en_tensor_))\n return data\n\ntrain_data = data_process(train_filepaths)\nval_data = data_process(val_filepaths)\ntest_data = data_process(test_filepaths)"
`37`	`37`	`]`
`38`	`38`	`},`
`39`	`39`	`{`