Use all 3 datasets to build the vocabularies (#15)

kylase · web-flow · commit c7a2557030ee · 2019-01-26T23:03:05.000+08:00
Replaced the hard-coded casts to float32, which caused reloading of the model to fail, with casts to the configured Theano floatX dtype
diff --git a/app/resources/parscit.py b/app/resources/parscit.py
@@ -42,6 +42,7 @@ def post(self):
         data = prepare_dataset([[[token] for token in tokens]],
                                current_app.word_to_id,
                                current_app.char_to_id,
+                               {},
                                current_app.model.parameters['lower'],
                                True)
 
@@ -81,6 +82,7 @@ def post(self):
         data = prepare_dataset(tokens,
                                current_app.word_to_id,
                                current_app.char_to_id,
+                               {},
                                current_app.model.parameters['lower'],
                                True)
 
diff --git a/model.py b/model.py
@@ -302,8 +302,8 @@ def build(self,
             transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
 
             small = -1000
-            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
-            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
+            b_s = np.array([[small] * n_tags + [0, small]]).astype(theano.config.floatX)
+            e_s = np.array([[small] * n_tags + [small, 0]]).astype(theano.config.floatX)
             observations = T.concatenate(
                 [tags_scores, small * T.ones((s_len, 2))],
                 axis=1
diff --git a/optimization.py b/optimization.py
@@ -154,7 +154,7 @@ def rmsprop(self, cost, params, lr=0.001, rho=0.9, eps=1e-6):
         lr = theano.shared(np.float32(lr).astype(floatX))
 
         gradients = self.get_gradients(cost, params)
-        accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(np.float32)) for p in params]
+        accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(floatX)) for p in params]
 
         updates = []
 
diff --git a/train.py b/train.py
@@ -138,7 +138,7 @@
 
 # Initialize model
 model = Model(parameters=parameters, models_path=models_path)
-logging.info("Model location: %s" % model.model_path)
+logging.info("Model location: %s", model.model_path)
 
 # Data parameters
 lower = parameters['lower']
@@ -155,22 +155,24 @@
 ##update_tag_scheme(dev_sentences, tag_scheme)
 ##update_tag_scheme(test_sentences, tag_scheme)
 
+all_sentences = train_sentences + dev_sentences + test_sentences
+
 # Create a dictionary / mapping of words
 # If we use pretrained embeddings, we add them to the dictionary.
 if parameters['pre_emb']:
-    dico_words_train = word_mapping(train_sentences, lower)[0]
+    dico_words_train = word_mapping(all_sentences, lower)[0]
     dico_words, word_to_id, id_to_word = augment_with_pretrained(
         dico_words_train.copy(),
         parameters['pre_emb'],
         None
     )
 else:
-    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
+    dico_words, word_to_id, id_to_word = word_mapping(all_sentences, lower)
     dico_words_train = dico_words
 
 # Create a dictionary and a mapping for words / POS tags / tags
-dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
-dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
+dico_chars, char_to_id, id_to_char = char_mapping(all_sentences)
+dico_tags, tag_to_id, id_to_tag = tag_mapping(all_sentences)
 
 # Index data
 train_data = prepare_dataset(
@@ -229,12 +231,12 @@
             logging.info("Score on dev: %.5f", dev_score)
             logging.info("Score on test: %.5f", test_score)
             if dev_score > best_dev:
+                logging.info("New best score on dev: %f. (Previously: %f)", dev_score, best_dev)
                 best_dev = dev_score
-                logging.info("New best score on dev.")
                 logging.info("Saving model to disk...")
                 model.save()
             if test_score > best_test:
+                logging.info("New best score on test: %f. (Previously: %f)", test_score, best_test)
                 best_test = test_score
-                logging.info("New best score on test.")
-    logging.info("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)))
+    logging.info("Epoch %i done. Average cost: %f", epoch, np.mean(epoch_costs))
 model.save()
diff --git a/utils.py b/utils.py
@@ -38,7 +38,7 @@ def set_values(name, param, pretrained):
         )
     param.set_value(np.reshape(
         pretrained, param_value.shape
-    ).astype(np.float32))
+    ).astype(theano.config.floatX))
 
 
 def shared(shape, name):

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ def set_values(name, param, pretrained):`
`38`	`38`	`)`
`39`	`39`	`param.set_value(np.reshape(`
`40`	`40`	`pretrained, param_value.shape`
`41`		`- ).astype(np.float32))`
	`41`	`+ ).astype(theano.config.floatX))`
`42`	`42`
`43`	`43`
`44`	`44`	`def shared(shape, name):`