Commit d77b92f

Optimisation (#5)

* Upgrade libraries and use KeyedVectors to load word vectors
* Use gensim native saved vectors instead
* Added tests and CI
* Updated with links to new word embeddings and some code cleaning

1 parent 1e4a870 commit d77b92f

File tree

15 files changed: +126 −96 lines


.gitignore

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,3 +1,3 @@
 *.pyc
-
+.pytest_cache
 .venv
```

.travis.yml

Lines changed: 8 additions & 0 deletions

```diff
@@ -0,0 +1,8 @@
+language: python
+cache: pip
+python:
+- "2.7"
+install:
+- pip install -r requirements/test.txt
+script:
+- pytest
```

Dockerfile

Lines changed: 1 addition & 1 deletion

```diff
@@ -6,7 +6,7 @@ RUN apt-get update \
     && apt-get install -y libopenblas-dev \
     && apt-get clean
 
-RUN pip install --no-cache-dir Theano==0.10.0beta4 numpy==1.13.3 gensim==0.13.2
+RUN pip install --no-cache-dir Theano==1.0.2 numpy==1.14.5 gensim==3.5.0
 
 RUN echo "[global]\nfloatX = float32" >> ~/.theanorc
 RUN echo "[blas]\nldflags = -lblas -lgfortran" >> ~/.theanorc
```

README.md

Lines changed: 10 additions & 5 deletions

````diff
@@ -1,5 +1,7 @@
 ## Neural ParsCit
 
+[![Build Status](https://travis-ci.com/WING-NUS/Neural-ParsCit.svg?branch=master)](https://travis-ci.com/WING-NUS/Neural-ParsCit)
+
 Neural ParsCit is a citation string parser which parses reference strings into its component tags such as Author, Journal, Location, Date, etc. Neural ParsCit uses Long Short Term Memory (LSTM), a deep learning model to parse the reference strings. This deep learning algorithm is chosen as it is designed to perform sequence-to-sequence labeling tasks such as ours. Input to the model are word embeddings which are vector representation of words. We provide word embeddings as well as character embeddings as input to the network.
 
 
@@ -15,14 +17,20 @@ source .venv/bin/activate
 pip install -r requirements.txt
 ```
 
+### Word Embeddings
+
+The word embeddings does not come with this repository. You can obtain the [word embeddings](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/vectors.tar.gz) and the [word frequency](http://wing.comp.nus.edu.sg/~wing.nus/resources/NParsCit/freq) from WING website.
+
+You will need to extract the content of the word embedding archive (`vectors.tar.gz`) to the root directory for this repository by running `tar xfz vectors.tar.gz`.
+
 ### Using Docker
 
 1. Build the image: `docker build -t theano-gensim - < Dockerfile`
 1. Run the repo mounted to the container: `docker run -it -v /path/to/Neural-ParsCit:/usr/src --name np theano-gensim:latest /bin/bash`
 
 ## Parse citation strings
 
-The fastest way to use the parser is to run state-of-the-art pretrained model as follows:
+The fastest way to use the parser is to run state-of-the-art pre-trained model as follows:
 
 ```
 ./run.py --model_path models/neuralParsCit/ --pre_emb <vectors.bin> --run shell
@@ -50,10 +58,7 @@ There are many parameters you can tune (CRF, dropout rate, embedding dimension,
 
 Input files for the training script have to follow the following format: each word of the citation string and its corresponding tag has to be on a separate line. All citation strings must be separated by a blank line.
 
-
-If you want to use the word embeddings trained on ACM refrences, and the freq., please download from WING homepage: http://wing.comp.nus.edu.sg/?page_id=158 (currently not avaible due to space issue, mail animesh@comp.nus.edu.sg, animeshprasad3@gmail.com for a copy)
-
-Details about the training data, experiments can be found in the following article. Traning data and CRF baseline can be downloaded from https://github.com/knmnyn/ParsCit. Please consider citing following piblication(s) if you use Neural ParsCit:
+Details about the training data, experiments can be found in the following article. Training data and CRF baseline can be downloaded from https://github.com/knmnyn/ParsCit. Please consider citing following publication(s) if you use Neural ParsCit:
 ```
 @article{animesh2018neuralparscit,
 title={Neural ParsCit: A Deep Learning Based Reference String Parser},
````

loader.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -172,7 +172,7 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
     # if len(ext_emb_path) > 0
     #])
 
-    pretrained = gensim.models.word2vec.Word2Vec.load_word2vec_format(ext_emb_path, binary=True)
+    pretrained = gensim.models.KeyedVectors.load_word2vec_format(ext_emb_path, binary=True)
 
     # We either add every word in the pretrained file,
     # or only words given in the `words` list to which
```
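The change above is the only functional edit in `loader.py`: since gensim 1.0, word2vec-format I/O lives on `KeyedVectors` rather than the `Word2Vec` model class. The surrounding function keeps folding the pretrained vocabulary into the model dictionary. A minimal sketch of that augmentation logic, with a plain set standing in for the loaded vectors' vocabulary (the function body here is illustrative, not the repository's exact code):

```python
import re

def augment_with_pretrained(dictionary, pretrained_vocab, words=None):
    """Add words that have a pretrained vector to the model dictionary.

    `dictionary` maps word -> count; `pretrained_vocab` stands in for the
    vocabulary of a loaded KeyedVectors object. If `words` is None, every
    pretrained word is added; otherwise only listed words whose surface,
    lowercased, or digit-normalized form has a vector are added.
    """
    if words is None:
        for word in pretrained_vocab:
            dictionary.setdefault(word, 0)
    else:
        for word in words:
            if any(form in pretrained_vocab for form in
                   (word, word.lower(), re.sub(r'\d', '0', word.lower()))):
                dictionary.setdefault(word, 0)
    return dictionary

vocab = {'smith', 'journal', '0000'}
d = augment_with_pretrained({'Smith': 3}, vocab,
                            words=['Smith', 'Journal', '1999', 'xyz'])
# 'Journal' enters via lowercasing, '1999' via digit normalization;
# 'xyz' has no matching vector and is skipped.
```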

model.py

Lines changed: 32 additions & 26 deletions

```diff
@@ -1,17 +1,20 @@
+from __future__ import print_function
+import logging
+import cPickle
 import os
 import re
 import numpy as np
 import scipy.io
 import theano
 import theano.tensor as T
-import codecs
-import cPickle
-import gensim
+from gensim.models import KeyedVectors
 
 from utils import shared, set_values, get_name
 from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward
 from optimization import Optimization
 
+logging.basicConfig(format="%(asctime)-15s %(message)s", level=logging.INFO)
+logger = logging.getLogger
 
 class Model(object):
     """
@@ -88,7 +91,7 @@ def save(self):
         """
         Write components values to disk.
         """
-        print "Saving parameter values to disk"
+        logging.info("Saving parameter values to disk")
         for name, param in self.components.items():
             param_path = os.path.join(self.model_path, "%s.mat" % name)
             if hasattr(param, 'params'):
@@ -97,7 +100,7 @@ def save(self):
                 param_values = {name: param.get_value()}
             #No need to save embeding values as they are never updated
             #directly use the pretrained embeddings file
-            if name=='word_layer':
+            if name == 'word_layer':
                 continue
             else:
                 scipy.io.savemat(param_path, param_values)
@@ -109,7 +112,7 @@ def reload(self):
         for name, param in self.components.items():
             param_path = os.path.join(self.model_path, "%s.mat" % name)
             #load word layer during build from pretrained embeddings file.
-            if name=='word_layer':
+            if name == 'word_layer':
                 continue
             else:
                 param_values = scipy.io.loadmat(param_path)
@@ -133,7 +136,7 @@ def build(self,
              cap_dim,
              training=True,
              **kwargs
-              ):
+             ):
         """
         Build the network.
         """
@@ -163,23 +166,20 @@ def build(self,
         input_dim = 0
         inputs = []
 
-        #
         # Word inputs
-        #
         if word_dim:
             input_dim += word_dim
-            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
+            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer', train=training)
             word_input = word_layer.link(word_ids)
             inputs.append(word_input)
             # Initialize with pretrained embeddings
             if pre_emb and training:
                 new_weights = word_layer.embeddings.get_value()
-                print 'Loading pretrained embeddings from %s...' % pre_emb
-                pretrained = {}
+                logging.info("Loading pretrained embeddings from %s...", pre_emb)
                 emb_invalid = 0
 
                 #use gensim models as pretrained embeddings
-                pretrained = gensim.models.word2vec.Word2Vec.load_word2vec_format(pre_emb, binary=True)
+                pretrained = KeyedVectors.load(pre_emb, mmap='r')
 
                 # for i, line in enumerate(codecs.open(pre_emb, 'r', 'cp850')):
                 #     line = line.rstrip().split()
@@ -196,30 +196,26 @@ def build(self,
                 c_lower = 0
                 c_zeros = 0
                 # Lookup table initialization
-                for i in xrange(n_words):
+                for i in range(n_words):
                     word = self.id_to_word[i]
                     if word in pretrained:
                         new_weights[i] = pretrained[word]
                         c_found += 1
                     elif word.lower() in pretrained:
                         new_weights[i] = pretrained[word.lower()]
                         c_lower += 1
-                    elif re.sub('\d', '0', word.lower()) in pretrained:
+                    elif re.sub(r'\d', '0', word.lower()) in pretrained:
                         new_weights[i] = pretrained[
-                            re.sub('\d', '0', word.lower())
+                            re.sub(r'\d', '0', word.lower())
                         ]
                         c_zeros += 1
                 word_layer.embeddings.set_value(new_weights)
                 # print 'Loaded %i pretrained embeddings.' % len(pretrained)
-                print ('%i / %i (%.4f%%) words have been initialized with '
-                       'pretrained embeddings.') % (
-                           c_found + c_lower + c_zeros, n_words,
-                           100. * (c_found + c_lower + c_zeros) / n_words
-                )
-                print ('%i found directly, %i after lowercasing, '
-                       '%i after lowercasing + zero.') % (
-                           c_found, c_lower, c_zeros
-                )
+                logging.info('%i / %i (%.4f%%) words have been initialized with '
+                             'pretrained embeddings.', c_found + c_lower + c_zeros,
+                             n_words, 100. * (c_found + c_lower + c_zeros) / n_words)
+                logging.info('%i found directly, %i after lowercasing, '
+                             '%i after lowercasing + zero.', c_found, c_lower, c_zeros)
 
         #
         # Chars inputs
@@ -384,7 +380,7 @@ def build(self,
         lr_method_parameters = {}
 
         # Compile training function
-        print 'Compiling...'
+        logging.info('Compiling...')
        if training:
             updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
             f_train = theano.function(
@@ -412,3 +408,13 @@ def build(self,
         )
 
         return f_train, f_eval
+
+    @staticmethod
+    def load_word_embeddings(embeddings, mode='r'):
+        if isinstance(embeddings, KeyedVectors):
+            return embeddings
+        else:
+            if os.path.isfile(embeddings) and os.path.isfile(embeddings + 'vectors.npy'):
+                return KeyedVectors.load(embeddings, mmap=mode)
+            else:
+                raise IOError("{embeddings} cannot be found.".format(embeddings=embeddings))
```
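The initialization loop in `Model.build` above tries three lookups per vocabulary word: the surface form, the lowercased form, then the lowercased form with digits mapped to `0`. A standalone sketch of that fallback order (the function name `lookup_embedding` is illustrative, and a plain dict stands in for the loaded `KeyedVectors`):

```python
import re

def lookup_embedding(word, pretrained):
    """Return (vector, how), using the same fallback order as Model.build:
    exact match, then lowercase, then lowercase with digits zeroed."""
    if word in pretrained:
        return pretrained[word], 'found'
    lower = word.lower()
    if lower in pretrained:
        return pretrained[lower], 'lower'
    zeroed = re.sub(r'\d', '0', lower)
    if zeroed in pretrained:
        return pretrained[zeroed], 'zeros'
    return None, 'missing'

vectors = {'acm': [0.1], '0000': [0.2]}
print(lookup_embedding('ACM', vectors))   # matched after lowercasing
print(lookup_embedding('1998', vectors))  # matched after zeroing digits
```

Digit zeroing is why years and page numbers still get a sensible vector: every all-digit token of the same length shares one embedding.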

nn.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -59,18 +59,20 @@ class EmbeddingLayer(object):
     Output: tensor of dimension (dim*, output_dim)
     """
 
-    def __init__(self, input_dim, output_dim, name='embedding_layer'):
+    def __init__(self, input_dim, output_dim, name='embedding_layer', train=True):
         """
         Typically, input_dim is the vocabulary size,
         and output_dim the embedding dimension.
         """
         self.input_dim = input_dim
         self.output_dim = output_dim
         self.name = name
+        self.train = train
 
         # Randomly generate weights
         self.embeddings = shared((input_dim, output_dim),
-                                 self.name + '__embeddings')
+                                 self.name + '__embeddings',
+                                 train=self.train)
 
         # Define parameters
         self.params = [self.embeddings]
```
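The new `train` flag lets the `shared` helper mark the embedding matrix as frozen, so the word layer can be excluded from gradient updates when its weights come from read-only, memory-mapped vectors. A toy sketch of that pattern (the `SharedParam` class and `trainable` attribute are illustrative stand-ins; the repository's actual `shared` helper in `utils.py` is not reproduced here):

```python
import numpy as np

class SharedParam(object):
    """Stand-in for a Theano shared variable carrying a trainable flag."""
    def __init__(self, shape, name, train=True):
        self.value = np.zeros(shape, dtype='float32')
        self.name = name
        self.trainable = train

class EmbeddingLayer(object):
    def __init__(self, input_dim, output_dim, name='embedding_layer', train=True):
        self.embeddings = SharedParam((input_dim, output_dim),
                                      name + '__embeddings', train=train)
        self.params = [self.embeddings]

def trainable_params(layers):
    # The optimizer would only receive parameters whose flag is set.
    return [p for layer in layers for p in layer.params if p.trainable]

frozen = EmbeddingLayer(10, 4, name='word_layer', train=False)
tuned = EmbeddingLayer(10, 4, name='char_layer', train=True)
print(len(trainable_params([frozen, tuned])))  # 1
```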

requirements/dev.txt

Lines changed: 3 additions & 0 deletions

```diff
@@ -1 +1,4 @@
 -r prod.txt
+pylint==1.9.2
+pytest==3.5.1
+ipython==5.7.0
```

requirements/prod.txt

Lines changed: 3 additions & 3 deletions

```diff
@@ -1,3 +1,3 @@
-gensim==0.13.2
-theano==0.10.b4
-numpy==1.13.3
+gensim==3.5.0
+theano==1.0.2
+numpy==1.14.5
```

requirements/test.txt

Lines changed: 3 additions & 0 deletions

```diff
@@ -0,0 +1,3 @@
+-r prod.txt
+pylint==1.9.2
+pytest==3.5.1
```
