diff --git a/loader.py b/loader.py index a7368685..2317b7fd 100644 --- a/loader.py +++ b/loader.py @@ -61,9 +61,7 @@ def word_mapping(sentences, lower): dico = create_dico(words) dico[''] = 10000000 word_to_id, id_to_word = create_mapping(dico) - print "Found %i unique words (%i in total)" % ( - len(dico), sum(len(x) for x in words) - ) + print("Found %i unique words (%i in total)" % ( len(dico), sum(len(x) for x in words) )) return dico, word_to_id, id_to_word @@ -74,7 +72,7 @@ def char_mapping(sentences): chars = ["".join([w[0] for w in s]) for s in sentences] dico = create_dico(chars) char_to_id, id_to_char = create_mapping(dico) - print "Found %i unique characters" % len(dico) + print("Found %i unique characters" % len(dico) ) return dico, char_to_id, id_to_char @@ -85,7 +83,7 @@ def tag_mapping(sentences): tags = [[word[-1] for word in s] for s in sentences] dico = create_dico(tags) tag_to_id, id_to_tag = create_mapping(dico) - print "Found %i unique named entity tags" % len(dico) + print("Found %i unique named entity tags" % len(dico) ) return dico, tag_to_id, id_to_tag @@ -160,7 +158,7 @@ def augment_with_pretrained(dictionary, ext_emb_path, words): to the dictionary, otherwise, we only add the words that are given by `words` (typically the words in the development and test sets.) """ - print 'Loading pretrained embeddings from %s...' % ext_emb_path + print('Loading pretrained embeddings from %s...' % ext_emb_path) assert os.path.isfile(ext_emb_path) # Load pretrained embeddings from file @@ -188,3 +186,6 @@ def augment_with_pretrained(dictionary, ext_emb_path, words): word_to_id, id_to_word = create_mapping(dictionary) return dictionary, word_to_id, id_to_word + +if __name__ == "__main__": + print("testMain") diff --git a/model.py b/model.py index fca1ab48..9e29bb24 100644 --- a/model.py +++ b/model.py @@ -5,7 +5,7 @@ import theano import theano.tensor as T import codecs -import cPickle +import _pickle as cPickle from utils import shared, set_values, get_name from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, LSTM, forward @@ -163,7 +163,7 @@ def build(self, # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() - print 'Loading pretrained embeddings from %s...' % pre_emb + print('Loading pretrained embeddings from %s...' % pre_emb) pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): @@ -175,7 +175,7 @@ def build(self, else: emb_invalid += 1 if emb_invalid > 0: - print 'WARNING: %i invalid lines' % emb_invalid + print('WARNING: %i invalid lines' % emb_invalid) c_found = 0 c_lower = 0 c_zeros = 0 @@ -194,7 +194,7 @@ def build(self, ] c_zeros += 1 word_layer.embeddings.set_value(new_weights) - print 'Loaded %i pretrained embeddings.' % len(pretrained) + print('Loaded %i pretrained embeddings.' % len(pretrained)) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, @@ -368,7 +368,7 @@ def build(self, lr_method_parameters = {} # Compile training function - print 'Compiling...' + print('Compiling...') if training: updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( diff --git a/tagger.py b/tagger.py index 47b8f193..b50dbf90 100755 --- a/tagger.py +++ b/tagger.py @@ -8,6 +8,7 @@ from loader import prepare_sentence from utils import create_input, iobes_iob, zero_digits from model import Model +import pdb optparser = optparse.OptionParser() optparser.add_option( @@ -34,7 +35,7 @@ assert os.path.isfile(opts.input) # Load existing model -print "Loading model..." +print("Loading model...") model = Model(model_path=opts.model) parameters = model.parameters @@ -51,12 +52,12 @@ f_output = codecs.open(opts.output, 'w', 'utf-8') start = time.time() -print 'Tagging...' +print('Tagging...') with codecs.open(opts.input, 'r', 'utf-8') as f_input: count = 0 for line in f_input: words = line.rstrip().split() - if line: + if len(line.strip()): # Lowercase sentence if parameters['lower']: line = line.lower() @@ -84,7 +85,7 @@ f_output.write('\n') count += 1 if count % 100 == 0: - print count + print(count) -print '---- %i lines tagged in %.4fs ----' % (count, time.time() - start) +print('---- %i lines tagged in %.4fs ----' % (count, time.time() - start)) f_output.close() diff --git a/train.py b/train.py index 5dbe0436..816b569b 100755 --- a/train.py +++ b/train.py @@ -133,7 +133,7 @@ # Initialize model model = Model(parameters=parameters, models_path=models_path) -print "Model location: %s" % model.model_path +print("Model location: %s" % model.model_path) # Data parameters lower = parameters['lower'] @@ -180,11 +180,11 @@ test_sentences, word_to_id, char_to_id, tag_to_id, lower ) -print "%i / %i / %i sentences in train / dev / test." % ( - len(train_data), len(dev_data), len(test_data)) +print("%i / %i / %i sentences in train / dev / test." % ( + len(train_data), len(dev_data), len(test_data))) # Save the mappings to disk -print 'Saving the mappings to disk...' +print('Saving the mappings to disk...') model.save_mappings(id_to_word, id_to_char, id_to_tag) # Build the model @@ -192,7 +192,7 @@ # Reload previous model values if opts.reload: - print 'Reloading previous model...' + print('Reloading previous model...') model.reload() # @@ -207,27 +207,27 @@ count = 0 for epoch in xrange(n_epochs): epoch_costs = [] - print "Starting epoch %i..." % epoch + print("Starting epoch %i..." % epoch) for i, index in enumerate(np.random.permutation(len(train_data))): count += 1 input = create_input(train_data[index], parameters, True, singletons) new_cost = f_train(*input) epoch_costs.append(new_cost) if i % 50 == 0 and i > 0 == 0: - print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])) + print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))) if count % freq_eval == 0: dev_score = evaluate(parameters, f_eval, dev_sentences, dev_data, id_to_tag, dico_tags) test_score = evaluate(parameters, f_eval, test_sentences, test_data, id_to_tag, dico_tags) - print "Score on dev: %.5f" % dev_score - print "Score on test: %.5f" % test_score + print("Score on dev: %.5f" % dev_score) + print("Score on test: %.5f" % test_score) if dev_score > best_dev: best_dev = dev_score - print "New best score on dev." - print "Saving model to disk..." + print("New best score on dev.") + print("Saving model to disk...") model.save() if test_score > best_test: best_test = test_score - print "New best score on test." - print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs)) + print("New best score on test.") + print("Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))) diff --git a/utils.py b/utils.py index 19e3175d..d4852858 100644 --- a/utils.py +++ b/utils.py @@ -3,7 +3,7 @@ import codecs import numpy as np import theano - +import pdb models_path = "./models" eval_path = "./evaluation" @@ -177,6 +177,8 @@ def pad_word_chars(words): - padded list of lists of ints (where chars are reversed) - list of ints corresponding to the index of the last character of each word """ + print(words) + print('\n') max_length = max([len(word) for word in words]) char_for = [] char_rev = [] @@ -224,7 +226,6 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences, n_tags = len(id_to_tag) predictions = [] count = np.zeros((n_tags, n_tags), dtype=np.int32) - for raw_sentence, data in zip(raw_sentences, parsed_sentences): input = create_input(data, parameters, False) if parameters['crf']: @@ -255,28 +256,28 @@ def evaluate(parameters, f_eval, raw_sentences, parsed_sentences, # CoNLL evaluation results eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')] for line in eval_lines: - print line + print(line) # Remove temp files # os.remove(output_path) # os.remove(scores_path) # Confusion matrix with accuracy for each tag - print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format( + print( ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format( "ID", "NE", "Total", *([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"]) - ) + )) for i in xrange(n_tags): - print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format( + print( ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format( str(i), id_to_tag[i], str(count[i].sum()), *([count[i][j] for j in xrange(n_tags)] + ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))]) - ) + )) # Global accuracy - print "%i/%i (%.5f%%)" % ( + print( "%i/%i (%.5f%%)" % ( count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum()) - ) + )) # F1 on all entities return float(eval_lines[1].strip().split()[-1])