diff --git a/loader.py b/loader.py
index a7368685..bbf9829e 100644
--- a/loader.py
+++ b/loader.py
@@ -3,6 +3,7 @@
 import codecs
 from utils import create_dico, create_mapping, zero_digits
 from utils import iob2, iob_iobes
+import numpy as np
 
 
 def load_sentences(path, lower, zeros):
@@ -26,6 +27,7 @@ def load_sentences(path, lower, zeros):
         if len(sentence) > 0:
             if 'DOCSTART' not in sentence[0][0]:
                 sentences.append(sentence)
+
     return sentences
 
 
@@ -188,3 +190,59 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):
     word_to_id, id_to_word = create_mapping(dictionary)
     return dictionary, word_to_id, id_to_word
+
+def load_gazetteers(gaz_path):
+    """
+    Load extra gazetteers.
+
+    Each line holds two tab-separated fields: <gazetteer>\t<category>.
+    The same gazetteer may appear with multiple categories, but each
+    (gazetteer, category) pair must be on its own line.
+
+    Returns (gazetteers, tags): `gazetteers` maps a gazetteer string to
+    the list of its categories; `tags` is the sorted list of categories.
+    """
+    # `with` closes the file deterministically (plain open() leaked the
+    # handle); codecs.open matches the encoding handling in this file.
+    with codecs.open(gaz_path, 'r', 'utf-8') as f:
+        lines = [line.rstrip() for line in f]
+    gazetteers = {}
+    tags = {}
+    for line in lines:
+        split = line.split("\t")
+        assert len(split) >= 2
+        # Last field is the category; everything before it is the
+        # gazetteer (the entry itself may contain tab characters).
+        category, gazetteer = " ".join(split[-1:]), " ".join(split[:-1])
+        if category not in tags:
+            tags[category] = 1
+        # Avoid duplicate categories for the same gazetteer entry.
+        categories = gazetteers.setdefault(gazetteer, [])
+        if category not in categories:
+            categories.append(category)
+    # Sort so the category -> feature-slot mapping is deterministic
+    # across runs (dict key order is arbitrary in Python 2).
+    return gazetteers, sorted(tags.keys())
+
+
+def add_gazetteers(data, token_to_gazetteers, index_to_token, gaz_tags):
+    """
+    Add a one-hot gazetteer feature vector to every word of every sentence.
+
+    data: list of sentence dicts, each with a 'words' list of word ids.
+    token_to_gazetteers: lowercased token -> list of categories.
+    index_to_token: word id -> word string.
+    gaz_tags: ordered categories; the vector has one slot per category.
+    """
+    tags = gaz_tags
+    for sentence in data:
+        sentence['gazetteers'] = [np.zeros(len(tags), dtype=np.int32)
+                                  for _ in xrange(len(sentence['words']))]
+        for i, token_id in enumerate(sentence['words']):
+            token = index_to_token[token_id].lower()
+            if token in token_to_gazetteers:
+                values = set(token_to_gazetteers[token])
+                for j, tag in enumerate(tags):
+                    if tag in values:
+                        # Mark category j as present for word i.
+                        sentence['gazetteers'][i][j] = 1
diff --git a/model.py b/model.py
index fca1ab48..8e7fba56 100644
--- a/model.py
+++ b/model.py
@@ -119,6 +119,7 @@ def build(self,
               lr_method,
               pre_emb,
               crf,
+              gaz_dim, gaz_path,
               cap_dim,
               training=True,
               **kwargs
@@ -135,7 +136,12 @@ def build(self,
         if cap_dim:
             n_cap = 4
 
-        # Network variables
+        # Gazetteer features: one slot per gazetteer category.
+        if gaz_path:
+            n_gaz = self.parameters['gaz_dim']  # number of categories
+            gaz_dim = n_gaz
+
+        # Network variables
@@ -144,7 +150,8 @@ def build(self,
         char_pos_ids = T.ivector(name='char_pos_ids')
         tag_ids = T.ivector(name='tag_ids')
         if cap_dim:
            cap_ids = T.ivector(name='cap_ids')
-
+        if gaz_path:
+            gaz_values = T.imatrix(name='gaz_values')
         # Sentence length
         s_len = (word_ids if word_dim else char_pos_ids).shape[0]
@@ -240,6 +247,12 @@ def build(self,
             cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
             inputs.append(cap_layer.link(cap_ids))
 
+        # Gazetteer features: embed the per-category indicator values.
+        # NOTE(review): gaz_values is a matrix, so this embedding output is
+        # 3-D; confirm the axis=1 concatenation below accepts it (a reshape
+        # to (s_len, n_gaz * gaz_dim) may be needed).
+        if gaz_path:
+            input_dim += gaz_dim
+            gaz_layer = EmbeddingLayer(n_gaz, gaz_dim, name='gaz_layer')
+            inputs.append(gaz_layer.link(gaz_values))
+
         # Prepare final input
         if len(inputs) != 1:
             inputs = T.concatenate(inputs, axis=1)
@@ -335,6 +348,12 @@ def build(self,
             params.extend(cap_layer.params)
         self.add_component(final_layer)
         params.extend(final_layer.params)
+        # Gazetteer features: register the embedding table so it is saved
+        # with the model and trained with the other parameters.
+        if gaz_path:
+            self.add_component(gaz_layer)
+            params.extend(gaz_layer.params)
+
         if crf:
             self.add_component(transitions)
             params.append(transitions)
@@ -353,6 +372,8 @@ def build(self,
             eval_inputs.append(char_pos_ids)
         if cap_dim:
             eval_inputs.append(cap_ids)
+        if gaz_path:
+            eval_inputs.append(gaz_values)
         train_inputs = eval_inputs + [tag_ids]
 
         # Parse optimization method parameters
diff --git a/train.py b/train.py
index 5dbe0436..cd1242a0 100755
--- a/train.py
+++ b/train.py
@@ -92,6 +92,14 @@
     "-r", "--reload", default="0",
     type='int', help="Reload the last saved model"
 )
+optparser.add_option(
+    "-G", "--gaz_path", default="",
+    help="Path to a gazetteers file (empty disables gazetteer features)"
+)
+optparser.add_option(
+    "-g", "--gaz_dim", default="5",
+    type='int', help="Gazetteers dimension"
+)
 opts = optparser.parse_args()[0]
 
 # Parse parameters
@@ -111,7 +119,8 @@
 parameters['crf'] = opts.crf == 1
 parameters['dropout'] = opts.dropout
 parameters['lr_method'] = opts.lr_method
-
+parameters['gaz_dim'] = opts.gaz_dim
+parameters['gaz_path'] = opts.gaz_path
 # Check parameters validity
 assert os.path.isfile(opts.train)
 assert os.path.isfile(opts.dev)
@@ -122,7 +131,7 @@
 assert not parameters['all_emb'] or parameters['pre_emb']
 assert not parameters['pre_emb'] or parameters['word_dim'] > 0
 assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])
-
+assert not parameters['gaz_path'] or os.path.isfile(parameters['gaz_path'])
 # Check evaluation script / folders
 if not os.path.isfile(eval_script):
     raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
@@ -183,6 +192,17 @@
 print "%i / %i / %i sentences in train / dev / test." % (
     len(train_data), len(dev_data), len(test_data))
 
+if parameters['gaz_path']:
+    '''1: read the gazetteers file (format: <gazetteer>\t<category>)
+    2: build a one-hot gazetteer vector for every word in the sentence;
+       the vector length equals the number of gazetteer categories.
+    '''
+    gazetteers_dataset, gaz_tags = loader.load_gazetteers(parameters['gaz_path'])
+    parameters['gaz_dim'] = len(gaz_tags)  # actual width = number of categories
+    loader.add_gazetteers(train_data, gazetteers_dataset, id_to_word, gaz_tags)
+    loader.add_gazetteers(dev_data, gazetteers_dataset, id_to_word, gaz_tags)
+    loader.add_gazetteers(test_data, gazetteers_dataset, id_to_word, gaz_tags)
+
 # Save the mappings to disk
 print 'Saving the mappings to disk...'
 model.save_mappings(id_to_word, id_to_char, id_to_tag)
diff --git a/utils.py b/utils.py
index 19e3175d..224cf968 100644
--- a/utils.py
+++ b/utils.py
@@ -198,6 +198,8 @@ def create_input(data, parameters, add_label, singletons=None):
     chars = data['chars']
     if singletons is not None:
         words = insert_singletons(words, singletons)
+    if parameters['gaz_path']:
+        gaz = data['gazetteers']
     if parameters['cap_dim']:
         caps = data['caps']
     char_for, char_rev, char_pos = pad_word_chars(chars)
@@ -213,6 +215,8 @@ def create_input(data, parameters, add_label, singletons=None):
         input.append(caps)
+    if parameters['gaz_path']:
+        input.append(gaz)  # must precede tags: train_inputs = eval_inputs + [tag_ids]
     if add_label:
         input.append(data['tags'])
     return input