ch6-1_a.py
import os
import sys

# Prefer the CNTK backend for Keras when available; otherwise fall back to TensorFlow.
use_cntk = True
if use_cntk:
    try:
        base_directory = os.path.split(sys.executable)[0]
        os.environ['PATH'] += ';' + base_directory
        import cntk
        os.environ['KERAS_BACKEND'] = 'cntk'
    except ImportError:
        print('CNTK not installed')
else:
    os.environ['KERAS_BACKEND'] = 'tensorflow'

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
##################################################################
# WORD-LEVEL ONE-HOT ENCODING
##################################################################
import numpy as np

# This is our initial data; one entry per "sample"
# (in this toy example, a "sample" is just a sentence, but
# it could be an entire document).
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# First, build an index of all tokens in the data.
token_index = {}
for sample in samples:
    # We simply tokenize the samples via the `split` method.
    # In real life, we would also strip punctuation and special characters
    # from the samples.
    for word in sample.split():
        if word not in token_index:
            # Assign a unique index to each unique word.
            token_index[word] = len(token_index) + 1
            # Note that we don't attribute index 0 to anything.

# Next, we vectorize our samples.
# We will only consider the first `max_length` words in each sample.
max_length = 10

# This is where we store our results:
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1), dtype=np.uint8)
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.

print('Word-Level Encoding')
print(results)
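
# Illustrative sketch (not part of the original script): decode the one-hot
# tensor back to words, assuming the `token_index` built above. The
# `reverse_token_index` helper is hypothetical, added only to show that the
# encoding round-trips; index 0 is never assigned, so all-zero padding rows
# can be skipped safely.
reverse_token_index = {index: word for word, index in token_index.items()}
decoded_first_sample = [reverse_token_index[idx]
                        for idx in results[0].argmax(axis=-1) if idx != 0]
print('Decoded (word level):', ' '.join(decoded_first_sample))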
##################################################################
# CHARACTER LEVEL ONE-HOT ENCODING
##################################################################
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']
characters = string.printable  # All printable ASCII characters.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

max_length = 50
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1.

print('\nCharacter Level One-Hot Encoding')
print(results)
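
# Illustrative check (not part of the original script): each sample becomes a
# (max_length, len(string.printable) + 1) matrix, i.e. (50, 101) here, so the
# full tensor is (num_samples, 50, 101).
print('Character-level tensor shape:', results.shape)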
##################################################################
# Using Keras For Word-Level One-Hot Encoding
##################################################################
from keras.preprocessing.text import Tokenizer
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
# We create a tokenizer, configured to only take
# into account the top-1000 most common words
tokenizer = Tokenizer(num_words=1000)
# This builds the word index
tokenizer.fit_on_texts(samples)
# This turns strings into lists of integer indices.
sequences = tokenizer.texts_to_sequences(samples)
# You could also directly get the one-hot binary representations.
# Note that other vectorization modes than one-hot encoding are supported!
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
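
# Illustrative sketch (not part of the original script): inspect the Tokenizer
# output computed above. `sequences` is a list of integer-index lists, and
# `one_hot_results` has shape (num_samples, num_words), i.e. (2, 1000) here.
print('Integer sequences:', sequences)
print('One-hot matrix shape:', one_hot_results.shape)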
##################################################################
# Hashing Trick
##################################################################
samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # Hash the word into a "random" integer index
        # between 0 and dimensionality - 1 (here, 0 to 999).
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.
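
# Illustrative addition (not part of the original script): print the shape for
# symmetry with the earlier sections. With the hashing trick there is no word
# index to invert; each token is only a position in a fixed-size vector.
print('\nHashing Trick')
print('Hashed one-hot tensor shape:', results.shape)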