|
#
# ::
#
# python -m spacy download en_core_web_sm
# python -m spacy download de_core_news_sm
41 | 41 |
|
42 | 42 | import torchtext |
43 | 43 | import torch |
44 | 44 | from torchtext.data.utils import get_tokenizer |
45 | 45 | from collections import Counter |
46 | | -from torchtext.vocab import Vocab |
| 46 | +from torchtext.vocab import vocab |
47 | 47 | from torchtext.utils import download_from_url, extract_archive |
48 | 48 | import io |
49 | 49 |
|
|
# Download and extract the validation and test splits of the corpus,
# keeping only the first extracted file path for each archive.
val_filepaths = [
    extract_archive(download_from_url(url_base + url))[0]
    for url in val_urls
]
test_filepaths = [
    extract_archive(download_from_url(url_base + url))[0]
    for url in test_urls
]
58 | 58 |
|
# spaCy tokenizers for the source (German) and target (English) sides.
# Recent spaCy releases require the full pipeline names, not 'de'/'en'.
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
61 | 61 |
|
def build_vocab(filepath, tokenizer):
    """Build a torchtext ``Vocab`` from a one-sentence-per-line text file.

    Args:
        filepath: Path to a UTF-8 encoded text file, one sentence per line.
        tokenizer: Callable mapping a string to a list of token strings.

    Returns:
        A ``torchtext.vocab.Vocab`` whose first entries are the specials
        ``<unk>``, ``<pad>``, ``<bos>``, ``<eos>``, with ``<unk>`` set as
        the default index for out-of-vocabulary lookups.
    """
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    v = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    # Unlike the legacy Vocab (which mapped unknown tokens to <unk>), the
    # vocab() factory raises RuntimeError on OOV lookups unless a default
    # index is set — restore the old behaviour explicitly.
    v.set_default_index(v['<unk>'])
    return v
68 | 68 |
|
# Vocabularies built from the training split: the first training file is
# the German side, the second is the English side.
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
|
0 commit comments