Skip to content

Commit f4ad0ba

Browse files
authored
Word embeddings loaded directly (#6)
* Model Evaluation (regression on v1.0.2) * Improved peak memory usage
1 parent d77b92f commit f4ad0ba

File tree

9 files changed

+161
-83
lines changed

9 files changed

+161
-83
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ python:
55
install:
66
- pip install -r requirements/test.txt
77
script:
8-
- pytest
8+
- pytest -rs

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
FROM python:2
22

3+
ENV ENVIRONMENT prod
4+
35
WORKDIR /usr/src
46

57
RUN apt-get update \

README.md

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,23 @@ source .venv/bin/activate
1717
pip install -r requirements.txt
1818
```
1919

20-
### Word Embeddings
21-
22-
The word embeddings do not come with this repository. You can obtain the [word embeddings](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) from the WING website.
23-
24-
You will need to extract the content of the word embedding archive (`vectors.tar.gz`) to the root directory for this repository by running `tar xfz vectors.tar.gz`.
25-
2620
### Using Docker
2721

2822
1. Build the image: `docker build -t theano-gensim - < Dockerfile`
2923
1. Run the repo mounted to the container: `docker run -it -v /path/to/Neural-ParsCit:/usr/src --name np theano-gensim:latest /bin/bash`
3024

25+
## Word Embeddings
26+
27+
The word embeddings do not come with this repository. You can obtain the [word embeddings without `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) (not recommended for v1.0.3) or [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) (deprecated in v1.0.3, as the entire set of word vectors can now be loaded with less memory) from the WING website. Please read the next section on the availability of `<UNK>` in word embeddings.
28+
29+
You will need to extract the content of the word embedding archive (`vectors_with_unk.tar.gz`) to the root directory for this repository by running `tar xfz vectors_with_unk.tar.gz`.
30+
31+
### Embeddings Without `<UNK>`
32+
33+
If the word embeddings provided do not include `<UNK>`, your instance will not benefit from lazy loading of the word vectors, and therefore will not see the reduced memory requirements.
34+
35+
Without `<UNK>`, at most 7.5 GB of memory is required, as the entire set of word vectors must be instantiated in memory to create the new matrix. In comparison, embeddings with `<UNK>` require much less memory — at most 4.5 GB.
36+
3137
## Parse citation strings
3238

3339
The fastest way to use the parser is to run state-of-the-art pre-trained model as follows:

model.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward
1414
from optimization import Optimization
1515

16-
logging.basicConfig(format="%(asctime)-15s %(message)s", level=logging.INFO)
16+
logging.basicConfig(format="[%(levelname)s] %(asctime)-15s: %(message)s",
17+
level=logging.INFO)
1718
logger = logging.getLogger
1819

1920
class Model(object):
@@ -169,18 +170,14 @@ def build(self,
169170
# Word inputs
170171
if word_dim:
171172
input_dim += word_dim
172-
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer', train=training)
173-
word_input = word_layer.link(word_ids)
174-
inputs.append(word_input)
175173
# Initialize with pretrained embeddings
176-
if pre_emb and training:
174+
pretrained = self.load_word_embeddings(pre_emb)
175+
if training:
176+
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
177177
new_weights = word_layer.embeddings.get_value()
178178
logging.info("Loading pretrained embeddings from %s...", pre_emb)
179179
emb_invalid = 0
180180

181-
#use gensim models as pretrained embeddings
182-
pretrained = KeyedVectors.load(pre_emb, mmap='r')
183-
184181
# for i, line in enumerate(codecs.open(pre_emb, 'r', 'cp850')):
185182
# line = line.rstrip().split()
186183
# if len(line) == word_dim + 1:
@@ -216,8 +213,13 @@ def build(self,
216213
n_words, 100. * (c_found + c_lower + c_zeros) / n_words)
217214
logging.info('%i found directly, %i after lowercasing, '
218215
'%i after lowercasing + zero.', c_found, c_lower, c_zeros)
216+
else:
217+
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer',
218+
pretrained=pretrained)
219+
self.id_to_word.update({i: w for i, w in enumerate(pretrained.index2entity)})
219220

220-
#
221+
word_input = word_layer.link(word_ids)
222+
inputs.append(word_input)
221223
# Chars inputs
222224
#
223225
if char_dim:
@@ -414,7 +416,8 @@ def load_word_embeddings(embeddings, mode='r'):
414416
if isinstance(embeddings, KeyedVectors):
415417
return embeddings
416418
else:
417-
if os.path.isfile(embeddings) and os.path.isfile(embeddings + 'vectors.npy'):
418-
return KeyedVectors.load(embeddings, mmap=mode)
419+
if os.path.isfile(embeddings) and os.path.isfile(embeddings + '.vectors.npy'):
420+
v = KeyedVectors.load(embeddings, mmap=mode)
421+
return v
419422
else:
420423
raise IOError("{embeddings} cannot be found.".format(embeddings=embeddings))

nn.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import logging
12
import theano
3+
import numpy as np
24
import theano.tensor as T
35
from utils import shared
46

@@ -59,20 +61,32 @@ class EmbeddingLayer(object):
5961
Output: tensor of dimension (dim*, output_dim)
6062
"""
6163

62-
def __init__(self, input_dim, output_dim, name='embedding_layer', train=True):
64+
def __init__(self, input_dim, output_dim, name='embedding_layer', pretrained=None):
6365
"""
6466
Typically, input_dim is the vocabulary size,
6567
and output_dim the embedding dimension.
6668
"""
6769
self.input_dim = input_dim
6870
self.output_dim = output_dim
6971
self.name = name
70-
self.train = train
71-
72-
# Randomly generate weights
73-
self.embeddings = shared((input_dim, output_dim),
74-
self.name + '__embeddings',
75-
train=self.train)
72+
if pretrained:
73+
if u'<UNK>' not in pretrained:
74+
logging.warn('<UNK> is not found in the pretrained and will be added.'
75+
'This will consume more memory than usual.')
76+
pretrained.add([u'<UNK>'],
77+
[np.zeros((pretrained.vectors.shape[1], ),
78+
dtype=theano.config.floatX)])
79+
80+
if pretrained.vectors.dtype == theano.config.floatX:
81+
self.embeddings = theano.shared(value=pretrained.vectors,
82+
name=self.name + '__embeddings')
83+
else:
84+
self.embeddings = theano.shared(value=pretrained.vectors.astype(theano.config.floatX),
85+
name=self.name + '__embeddings')
86+
else:
87+
# Randomly generate weights
88+
self.embeddings = shared((input_dim, output_dim),
89+
self.name + '__embeddings')
7690

7791
# Define parameters
7892
self.params = [self.embeddings]

requirements/dev.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
-r prod.txt
2-
pylint==1.9.2
3-
pytest==3.5.1
2+
-r test.txt
43
ipython==5.7.0
4+
git+https://github.com/pytorch/text.git@master
5+
torch==0.4.1
6+
sklearn==0.19.2

run.py

Lines changed: 18 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import json
66
import numpy as np
77
import theano
8-
from gensim.models import KeyedVectors
8+
from contextlib import closing
99
from utils import evaluate, create_input
1010
from model import Model
1111
from loader import augment_with_pretrained, load_sentences, prepare_dataset
@@ -35,42 +35,10 @@
3535
opts = optparser.parse_args()[0]
3636

3737
model = Model(model_path=opts.model_path)
38+
model.parameters['pre_emb'] = os.path.join(os.getcwd(), opts.pre_emb)
3839
f = model.build(training=False, **model.parameters)
39-
model.reload()
40-
41-
model.parameters['pre_emb'] = opts.pre_emb
42-
pretrained = KeyedVectors.load(model.parameters['pre_emb'], mmap='r')
43-
n_words = len(model.id_to_word)
44-
45-
#only include pretrained embeddings for 640780 most frequent words
46-
words = [item[0] for item in json.load(open('freq', 'r'))]
47-
48-
#Create new mapping because model.id_to_word only is an Ordered dict of only training and testing data
49-
model.id_to_word = {}
5040

51-
discarded = 640780
52-
new_weights = np.empty((n_words - n_words/2 + 1, 500), dtype=theano.config.floatX)
53-
for i in range((n_words/2), n_words):
54-
word = words[i]
55-
lower = word.lower()
56-
digits = re.sub(r'\d', '0', lower)
57-
idx = i - discarded
58-
if word in pretrained:
59-
model.id_to_word[idx] = word
60-
new_weights[idx] = pretrained[word]
61-
elif lower in pretrained:
62-
model.id_to_word[idx] = lower
63-
new_weights[idx] = pretrained[lower]
64-
elif digits in pretrained:
65-
model.id_to_word[idx] = digits
66-
new_weights[idx] = pretrained[digits]
67-
68-
model.id_to_word[0] = '<UNK>'
69-
#Reset the values of word layer
70-
model.components['word_layer'].embeddings.set_value(new_weights)
71-
#release memory occupied by word embeddings
72-
del pretrained
73-
del new_weights
41+
model.reload()
7442

7543
lower = model.parameters['lower']
7644
zeros = model.parameters['zeros']
@@ -82,36 +50,39 @@
8250
if opts.run == 'file':
8351
assert opts.input_file
8452
assert opts.output_file
85-
input_file = opts.input_file
53+
8654
output_file = opts.output_file
87-
data = open(input_file, 'r').read()
55+
56+
with closing(open(opts.input_file, 'r')) as fh:
57+
data = fh.read()
8858
strings = data.split('\n')
8959
else:
9060
string = raw_input("Enter the citation string: ")
9161
strings = [string]
62+
9263
test_file = "test_file"
9364
if os.path.exists(test_file):
9465
os.remove(test_file)
9566
file = open(test_file, 'a')
9667
for string in strings:
97-
file.write('\n'.join(string.split())+'\n')
68+
file.write('\n'.join(string.split()) + '\n')
9869
file.close()
9970
test_sentences = load_sentences(test_file, lower, zeros)
10071
data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower, True)
72+
10173
for citation in data:
10274
inputs = create_input(citation, model.parameters, False)
10375
y_pred = np.array(f[1](*inputs))[1:-1]
104-
tags = []
105-
for i in range(len(y_pred)):
106-
tags.append(model.id_to_tag[y_pred[i]])
107-
output = []
108-
for num, word in enumerate(citation['str_words']):
109-
output.append(word+'\t'+tags[num])
76+
77+
tags = [model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]
78+
79+
output = [w + '\t' + tags[i] for i, w in enumerate(citation['str_words'])]
80+
11081
if opts.run == 'file':
111-
file = open(output_file, 'w')
112-
file.write('\n'.join(output))
113-
file.close()
82+
with closing(open(output_file, 'w')) as fh:
83+
fh.write('\n'.join(output))
11484
else:
11585
print('\n'.join(output))
86+
11687
if opts.run == 'file':
11788
break

tests/models/test_inference.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import os
2+
import tempfile
3+
import pytest
4+
import requests
5+
import numpy as np
6+
7+
from model import Model
8+
from loader import load_sentences, prepare_dataset
9+
from utils import create_input
10+
11+
CORA_URL = "https://raw.githubusercontent.com/knmnyn/ParsCit/master/crfpp/traindata/cora.train"
12+
13+
# Skip this test when running in CI as the amount of memory is not sufficient
14+
# to build the model
15+
@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
16+
def test_inference_performance():
17+
from sklearn.metrics import f1_score
18+
from torchtext.datasets import SequenceTaggingDataset
19+
from torchtext.data import Field, NestedField
20+
21+
WORD = Field(init_token='<bos>', eos_token='<eos>')
22+
CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
23+
CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
24+
ENTITY = Field(init_token='<bos>', eos_token='<eos>')
25+
26+
data_file = tempfile.NamedTemporaryFile(delete=True)
27+
28+
# TODO Need to be decoded in Python 3
29+
data_file.write(requests.get(CORA_URL).content)
30+
31+
fields = [(('text', 'char'), (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]
32+
33+
dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")
34+
35+
model = Model(model_path='models/neuralParsCit')
36+
model.parameters['pre_emb'] = os.path.join(os.getcwd(), 'vectors_with_unk.kv')
37+
f = model.build(training=False, **model.parameters)
38+
39+
model.reload()
40+
41+
word_to_id = {v:i for i, v in model.id_to_word.items()}
42+
char_to_id = {v:i for i, v in model.id_to_char.items()}
43+
tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}
44+
45+
tf = tempfile.NamedTemporaryFile(delete=False)
46+
tf.write("\n\n".join(["\n".join(example.text) for example in dataset.examples]))
47+
tf.close()
48+
49+
train_sentences = load_sentences(tf.name,
50+
model.parameters['lower'],
51+
model.parameters['zeros'])
52+
53+
train_inputs = prepare_dataset(train_sentences,
54+
word_to_id,
55+
char_to_id,
56+
model.parameters['lower'], True)
57+
58+
preds = []
59+
60+
for citation in train_inputs:
61+
inputs = create_input(citation, model.parameters, False)
62+
y_pred = np.array(f[1](*inputs))[1:-1]
63+
64+
preds.append([(w, y_pred[i]) for i, w in enumerate(citation['str_words'])])
65+
66+
assert len(preds) == len(dataset.examples)
67+
68+
results = []
69+
70+
for P, T in zip(preds, dataset.examples):
71+
for p, t in zip(P, zip(T.text, T.entity)):
72+
results.append((p[1], tag_to_id[t[1]]))
73+
74+
pred, true = zip(*results)
75+
76+
eval_metrics = {
77+
'micro_f1': f1_score(true, pred, average='micro'),
78+
'macro_f1': f1_score(true, pred, average='macro')
79+
}
80+
81+
data_file.close()
82+
83+
assert eval_metrics == pytest.approx({'macro_f1': 0.98, 'micro_f1': 0.99}, abs=0.01)

utils.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,17 @@ def set_values(name, param, pretrained):
4141
).astype(np.float32))
4242

4343

44-
def shared(shape, name, train=True):
44+
def shared(shape, name):
4545
"""
4646
Create a shared object of a numpy array.
4747
"""
48-
if train:
49-
if len(shape) == 1:
50-
value = np.zeros(shape) # bias are initialized with zeros
51-
else:
52-
drange = np.sqrt(6. / (np.sum(shape)))
53-
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
54-
return theano.shared(value=value.astype(theano.config.floatX), name=name)
48+
if len(shape) == 1:
49+
value = np.zeros(shape) # bias are initialized with zeros
5550
else:
56-
return theano.shared(value=np.zeros(shape, dtype=theano.config.floatX), name=name)
51+
drange = np.sqrt(6. / (np.sum(shape)))
52+
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
5753

54+
return theano.shared(value=value.astype(theano.config.floatX), name=name)
5855

5956
def create_dico(item_list):
6057
"""

0 commit comments

Comments
 (0)