Skip to content

Commit f4ad0ba

Browse files
authored
Word embeddings loaded directly (#6)
* Model Evaluation (regression on v1.0.2) * Improved peak memory usage
1 parent d77b92f commit f4ad0ba

File tree

9 files changed

+161
-83
lines changed

9 files changed

+161
-83
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ python:
55
install:
66
- pip install -r requirements/test.txt
77
script:
8-
- pytest
8+
- pytest -rs

Dockerfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
FROM python:2
22

3+
ENV ENVIRONMENT prod
4+
35
WORKDIR /usr/src
46

57
RUN apt-get update \

README.md

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,23 @@ source .venv/bin/activate
1717
pip install -r requirements.txt
1818
```
1919

20-
### Word Embeddings
21-
22-
The word embeddings do not come with this repository. You can obtain the [word embeddings](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) from the WING website.
23-
24-
You will need to extract the content of the word embedding archive (`vectors.tar.gz`) to the root directory for this repository by running `tar xfz vectors.tar.gz`.
25-
2620
### Using Docker
2721

2822
1. Build the image: `docker build -t theano-gensim - < Dockerfile`
2923
1. Run the repo mounted to the container: `docker run -it -v /path/to/Neural-ParsCit:/usr/src --name np theano-gensim:latest /bin/bash`
3024

25+
## Word Embeddings
26+
27+
The word embeddings do not come with this repository. You can obtain the [word embeddings without `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) (not recommended for v1.0.3) or [word embeddings with `<UNK>`](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors_with_unk.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) (deprecated in v1.0.3, as the entire set of word vectors can now be loaded with less memory) from the WING website. Please read the next section on the availability of `<UNK>` in word embeddings.
28+
29+
You will need to extract the content of the word embedding archive (`vectors_with_unk.tar.gz`) to the root directory for this repository by running `tar xfz vectors_with_unk.tar.gz`.
30+
31+
### Embeddings Without `<UNK>`
32+
33+
If the word embeddings provided do not include `<UNK>`, your instance will not benefit from lazy loading of the word vectors, and therefore will not see the reduced memory requirements.
34+
35+
Without `<UNK>`, at most 7.5 GB of memory is required, as the entire set of word vectors must be instantiated in memory to create the new matrix. In comparison, embeddings with `<UNK>` require much less memory — at most 4.5 GB.
36+
3137
## Parse citation strings
3238

3339
The fastest way to use the parser is to run state-of-the-art pre-trained model as follows:

model.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward
1414
from optimization import Optimization
1515

16-
logging.basicConfig(format="%(asctime)-15s %(message)s", level=logging.INFO)
16+
logging.basicConfig(format="[%(levelname)s] %(asctime)-15s: %(message)s",
17+
level=logging.INFO)
1718
logger = logging.getLogger
1819

1920
class Model(object):
@@ -169,18 +170,14 @@ def build(self,
169170
# Word inputs
170171
if word_dim:
171172
input_dim += word_dim
172-
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer', train=training)
173-
word_input = word_layer.link(word_ids)
174-
inputs.append(word_input)
175173
# Initialize with pretrained embeddings
176-
if pre_emb and training:
174+
pretrained = self.load_word_embeddings(pre_emb)
175+
if training:
176+
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
177177
new_weights = word_layer.embeddings.get_value()
178178
logging.info("Loading pretrained embeddings from %s...", pre_emb)
179179
emb_invalid = 0
180180

181-
#use gensim models as pretrained embeddings
182-
pretrained = KeyedVectors.load(pre_emb, mmap='r')
183-
184181
# for i, line in enumerate(codecs.open(pre_emb, 'r', 'cp850')):
185182
# line = line.rstrip().split()
186183
# if len(line) == word_dim + 1:
@@ -216,8 +213,13 @@ def build(self,
216213
n_words, 100. * (c_found + c_lower + c_zeros) / n_words)
217214
logging.info('%i found directly, %i after lowercasing, '
218215
'%i after lowercasing + zero.', c_found, c_lower, c_zeros)
216+
else:
217+
word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer',
218+
pretrained=pretrained)
219+
self.id_to_word.update({i: w for i, w in enumerate(pretrained.index2entity)})
219220

220-
#
221+
word_input = word_layer.link(word_ids)
222+
inputs.append(word_input)
221223
# Chars inputs
222224
#
223225
if char_dim:
@@ -414,7 +416,8 @@ def load_word_embeddings(embeddings, mode='r'):
414416
if isinstance(embeddings, KeyedVectors):
415417
return embeddings
416418
else:
417-
if os.path.isfile(embeddings) and os.path.isfile(embeddings + 'vectors.npy'):
418-
return KeyedVectors.load(embeddings, mmap=mode)
419+
if os.path.isfile(embeddings) and os.path.isfile(embeddings + '.vectors.npy'):
420+
v = KeyedVectors.load(embeddings, mmap=mode)
421+
return v
419422
else:
420423
raise IOError("{embeddings} cannot be found.".format(embeddings=embeddings))

nn.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import logging
12
import theano
3+
import numpy as np
24
import theano.tensor as T
35
from utils import shared
46

@@ -59,20 +61,32 @@ class EmbeddingLayer(object):
5961
Output: tensor of dimension (dim*, output_dim)
6062
"""
6163

62-
def __init__(self, input_dim, output_dim, name='embedding_layer', train=True):
64+
def __init__(self, input_dim, output_dim, name='embedding_layer', pretrained=None):
6365
"""
6466
Typically, input_dim is the vocabulary size,
6567
and output_dim the embedding dimension.
6668
"""
6769
self.input_dim = input_dim
6870
self.output_dim = output_dim
6971
self.name = name
70-
self.train = train
71-
72-
# Randomly generate weights
73-
self.embeddings = shared((input_dim, output_dim),
74-
self.name + '__embeddings',
75-
train=self.train)
72+
if pretrained:
73+
if u'<UNK>' not in pretrained:
74+
logging.warn('<UNK> is not found in the pretrained and will be added.'
75+
'This will consume more memory than usual.')
76+
pretrained.add([u'<UNK>'],
77+
[np.zeros((pretrained.vectors.shape[1], ),
78+
dtype=theano.config.floatX)])
79+
80+
if pretrained.vectors.dtype == theano.config.floatX:
81+
self.embeddings = theano.shared(value=pretrained.vectors,
82+
name=self.name + '__embeddings')
83+
else:
84+
self.embeddings = theano.shared(value=pretrained.vectors.astype(theano.config.floatX),
85+
name=self.name + '__embeddings')
86+
else:
87+
# Randomly generate weights
88+
self.embeddings = shared((input_dim, output_dim),
89+
self.name + '__embeddings')
7690

7791
# Define parameters
7892
self.params = [self.embeddings]

requirements/dev.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
-r prod.txt
2-
pylint==1.9.2
3-
pytest==3.5.1
2+
-r test.txt
43
ipython==5.7.0
4+
git+https://github.com/pytorch/text.git@master
5+
torch==0.4.1
6+
sklearn==0.19.2

run.py

Lines changed: 18 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import json
66
import numpy as np
77
import theano
8-
from gensim.models import KeyedVectors
8+
from contextlib import closing
99
from utils import evaluate, create_input
1010
from model import Model
1111
from loader import augment_with_pretrained, load_sentences, prepare_dataset
@@ -35,42 +35,10 @@
3535
opts = optparser.parse_args()[0]
3636

3737
model = Model(model_path=opts.model_path)
38+
model.parameters['pre_emb'] = os.path.join(os.getcwd(), opts.pre_emb)
3839
f = model.build(training=False, **model.parameters)
39-
model.reload()
40-
41-
model.parameters['pre_emb'] = opts.pre_emb
42-
pretrained = KeyedVectors.load(model.parameters['pre_emb'], mmap='r')
43-
n_words = len(model.id_to_word)
44-
45-
#only include pretrained embeddings for 640780 most frequent words
46-
words = [item[0] for item in json.load(open('freq', 'r'))]
47-
48-
#Create new mapping because model.id_to_word only is an Ordered dict of only training and testing data
49-
model.id_to_word = {}
5040

51-
discarded = 640780
52-
new_weights = np.empty((n_words - n_words/2 + 1, 500), dtype=theano.config.floatX)
53-
for i in range((n_words/2), n_words):
54-
word = words[i]
55-
lower = word.lower()
56-
digits = re.sub(r'\d', '0', lower)
57-
idx = i - discarded
58-
if word in pretrained:
59-
model.id_to_word[idx] = word
60-
new_weights[idx] = pretrained[word]
61-
elif lower in pretrained:
62-
model.id_to_word[idx] = lower
63-
new_weights[idx] = pretrained[lower]
64-
elif digits in pretrained:
65-
model.id_to_word[idx] = digits
66-
new_weights[idx] = pretrained[digits]
67-
68-
model.id_to_word[0] = '<UNK>'
69-
#Reset the values of word layer
70-
model.components['word_layer'].embeddings.set_value(new_weights)
71-
#release memory occupied by word embeddings
72-
del pretrained
73-
del new_weights
41+
model.reload()
7442

7543
lower = model.parameters['lower']
7644
zeros = model.parameters['zeros']
@@ -82,36 +50,39 @@
8250
if opts.run == 'file':
8351
assert opts.input_file
8452
assert opts.output_file
85-
input_file = opts.input_file
53+
8654
output_file = opts.output_file
87-
data = open(input_file, 'r').read()
55+
56+
with closing(open(opts.input_file, 'r')) as fh:
57+
data = fh.read()
8858
strings = data.split('\n')
8959
else:
9060
string = raw_input("Enter the citation string: ")
9161
strings = [string]
62+
9263
test_file = "test_file"
9364
if os.path.exists(test_file):
9465
os.remove(test_file)
9566
file = open(test_file, 'a')
9667
for string in strings:
97-
file.write('\n'.join(string.split())+'\n')
68+
file.write('\n'.join(string.split()) + '\n')
9869
file.close()
9970
test_sentences = load_sentences(test_file, lower, zeros)
10071
data = prepare_dataset(test_sentences, word_to_id, char_to_id, lower, True)
72+
10173
for citation in data:
10274
inputs = create_input(citation, model.parameters, False)
10375
y_pred = np.array(f[1](*inputs))[1:-1]
104-
tags = []
105-
for i in range(len(y_pred)):
106-
tags.append(model.id_to_tag[y_pred[i]])
107-
output = []
108-
for num, word in enumerate(citation['str_words']):
109-
output.append(word+'\t'+tags[num])
76+
77+
tags = [model.id_to_tag[y_pred[i]] for i in range(len(y_pred))]
78+
79+
output = [w + '\t' + tags[i] for i, w in enumerate(citation['str_words'])]
80+
11081
if opts.run == 'file':
111-
file = open(output_file, 'w')
112-
file.write('\n'.join(output))
113-
file.close()
82+
with closing(open(output_file, 'w')) as fh:
83+
fh.write('\n'.join(output))
11484
else:
11585
print('\n'.join(output))
86+
11687
if opts.run == 'file':
11788
break

tests/models/test_inference.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import os
2+
import tempfile
3+
import pytest
4+
import requests
5+
import numpy as np
6+
7+
from model import Model
8+
from loader import load_sentences, prepare_dataset
9+
from utils import create_input
10+
11+
CORA_URL = "https://raw.githubusercontent.com/knmnyn/ParsCit/master/crfpp/traindata/cora.train"
12+
13+
# Skip this test when running in CI as the amount of memory is not sufficient
14+
# to build the model
15+
@pytest.mark.skipif(os.getenv("CI") == 'true', reason="Not running in CI")
16+
def test_inference_performance():
17+
from sklearn.metrics import f1_score
18+
from torchtext.datasets import SequenceTaggingDataset
19+
from torchtext.data import Field, NestedField
20+
21+
WORD = Field(init_token='<bos>', eos_token='<eos>')
22+
CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
23+
CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
24+
ENTITY = Field(init_token='<bos>', eos_token='<eos>')
25+
26+
data_file = tempfile.NamedTemporaryFile(delete=True)
27+
28+
# TODO Need to be decoded in Python 3
29+
data_file.write(requests.get(CORA_URL).content)
30+
31+
fields = [(('text', 'char'), (WORD, CHAR))] + [(None, None)] * 22 + [('entity', ENTITY)]
32+
33+
dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")
34+
35+
model = Model(model_path='models/neuralParsCit')
36+
model.parameters['pre_emb'] = os.path.join(os.getcwd(), 'vectors_with_unk.kv')
37+
f = model.build(training=False, **model.parameters)
38+
39+
model.reload()
40+
41+
word_to_id = {v:i for i, v in model.id_to_word.items()}
42+
char_to_id = {v:i for i, v in model.id_to_char.items()}
43+
tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}
44+
45+
tf = tempfile.NamedTemporaryFile(delete=False)
46+
tf.write("\n\n".join(["\n".join(example.text) for example in dataset.examples]))
47+
tf.close()
48+
49+
train_sentences = load_sentences(tf.name,
50+
model.parameters['lower'],
51+
model.parameters['zeros'])
52+
53+
train_inputs = prepare_dataset(train_sentences,
54+
word_to_id,
55+
char_to_id,
56+
model.parameters['lower'], True)
57+
58+
preds = []
59+
60+
for citation in train_inputs:
61+
inputs = create_input(citation, model.parameters, False)
62+
y_pred = np.array(f[1](*inputs))[1:-1]
63+
64+
preds.append([(w, y_pred[i]) for i, w in enumerate(citation['str_words'])])
65+
66+
assert len(preds) == len(dataset.examples)
67+
68+
results = []
69+
70+
for P, T in zip(preds, dataset.examples):
71+
for p, t in zip(P, zip(T.text, T.entity)):
72+
results.append((p[1], tag_to_id[t[1]]))
73+
74+
pred, true = zip(*results)
75+
76+
eval_metrics = {
77+
'micro_f1': f1_score(true, pred, average='micro'),
78+
'macro_f1': f1_score(true, pred, average='macro')
79+
}
80+
81+
data_file.close()
82+
83+
assert eval_metrics == pytest.approx({'macro_f1': 0.98, 'micro_f1': 0.99}, abs=0.01)

utils.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,20 +41,17 @@ def set_values(name, param, pretrained):
4141
).astype(np.float32))
4242

4343

44-
def shared(shape, name, train=True):
44+
def shared(shape, name):
4545
"""
4646
Create a shared object of a numpy array.
4747
"""
48-
if train:
49-
if len(shape) == 1:
50-
value = np.zeros(shape) # bias are initialized with zeros
51-
else:
52-
drange = np.sqrt(6. / (np.sum(shape)))
53-
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
54-
return theano.shared(value=value.astype(theano.config.floatX), name=name)
48+
if len(shape) == 1:
49+
value = np.zeros(shape) # bias are initialized with zeros
5550
else:
56-
return theano.shared(value=np.zeros(shape, dtype=theano.config.floatX), name=name)
51+
drange = np.sqrt(6. / (np.sum(shape)))
52+
value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
5753

54+
return theano.shared(value=value.astype(theano.config.floatX), name=name)
5855

5956
def create_dico(item_list):
6057
"""

0 commit comments

Comments
 (0)