Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import codecs
from utils import create_dico, create_mapping, zero_digits
from utils import iob2, iob_iobes
import numpy as np


def load_sentences(path, lower, zeros):
Expand All @@ -26,6 +27,7 @@ def load_sentences(path, lower, zeros):
if len(sentence) > 0:
if 'DOCSTART' not in sentence[0][0]:
sentences.append(sentence)

return sentences


Expand Down Expand Up @@ -188,3 +190,46 @@ def augment_with_pretrained(dictionary, ext_emb_path, words):

word_to_id, id_to_word = create_mapping(dictionary)
return dictionary, word_to_id, id_to_word

def load_gazetteers(gaz_path):
    """
    Load extra gazetteers from a tab-separated file.

    Each non-blank line holds a gazetteer entry followed by its category:
        gazetteer<tab>category
    The last tab-separated field is the category; all preceding fields are
    joined with single spaces to form the gazetteer entry. The same entry may
    carry several categories, each given on its own line.

    Returns a pair (gazetteers, tags):
        gazetteers: dict mapping every entry to the list of its categories,
            without duplicates, in order of first appearance.
        tags: list of all distinct categories, in order of first appearance.

    Raises ValueError when a non-blank line has fewer than two fields.
    """
    gazetteers = {}
    # Categories are kept in a list (plus a set for fast membership) so the
    # returned order is deterministic; callers use it as a vector layout.
    tags = []
    seen_tags = set()
    # The context manager closes the file even when a line is malformed.
    with open(gaz_path, 'r') as gaz_file:
        for line_no, raw_line in enumerate(gaz_file, 1):
            line = raw_line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            fields = line.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "%s:%d: expected 'gazetteer<tab>category', got %r"
                    % (gaz_path, line_no, line)
                )
            category = fields[-1]
            gazetteer = " ".join(fields[:-1])
            if category not in seen_tags:
                seen_tags.add(category)
                tags.append(category)
            categories = gazetteers.setdefault(gazetteer, [])
            if category not in categories:
                categories.append(category)
    return gazetteers, tags

def add_gazetteers(data, token_to_gazetteers, index_to_token, gaz_tags):
    """
    Add gazetteer features to data, in place.

    Every sentence in `data` (a dict with a 'words' list of token ids) gets a
    'gazetteers' entry: a list with one int32 numpy vector per word. Position
    j of a vector is 1 when the lowercased token appears in
    `token_to_gazetteers` with category `gaz_tags[j]`, and 0 otherwise.

    data: iterable of sentence dicts; each is modified in place.
    token_to_gazetteers: mapping from token to an iterable of categories.
    index_to_token: mapping from token id to token string.
    gaz_tags: iterable of categories; its order defines the vector layout.

    NOTE(review): tokens are lowercased before the lookup, so only lowercase
    keys of `token_to_gazetteers` can ever match -- confirm the gazetteer
    file is lowercased.
    """
    # Freeze the tag order once so every vector shares the same layout, even
    # when gaz_tags is a dict view or a one-shot iterable.
    tags = list(gaz_tags)
    n_tags = len(tags)
    for sentence in data:
        features = []
        for token_id in sentence['words']:
            vector = np.zeros(n_tags, dtype=np.int32)
            token = index_to_token[token_id].lower()
            if token in token_to_gazetteers:
                categories = set(token_to_gazetteers[token])
                for j, tag in enumerate(tags):
                    if tag in categories:
                        vector[j] = 1
            features.append(vector)
        sentence['gazetteers'] = features
25 changes: 23 additions & 2 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def build(self,
lr_method,
pre_emb,
crf,
gaz_dim,
cap_dim,
training=True,
**kwargs
Expand All @@ -135,7 +136,12 @@ def build(self,
if cap_dim:
n_cap = 4

# Network variables
#Gaz features
if gaz_path:
n_gaz = self.parameters['gaz_dim'] # ner tags
gaz_dim = n_gaz

#Network variables
is_train = T.iscalar('is_train')
word_ids = T.ivector(name='word_ids')
char_for_ids = T.imatrix(name='char_for_ids')
Expand All @@ -144,7 +150,8 @@ def build(self,
tag_ids = T.ivector(name='tag_ids')
if cap_dim:
cap_ids = T.ivector(name='cap_ids')

if gaz_path:
gaz_values = T.imatrix(name='gaz_values')
# Sentence length
s_len = (word_ids if word_dim else char_pos_ids).shape[0]

Expand Down Expand Up @@ -240,6 +247,12 @@ def build(self,
cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
inputs.append(cap_layer.link(cap_ids))

#gaz features
if gaz_path:
input_dim += gaz_dim
gaz_layer = EmbeddingLayer(n_gaz, gaz_dim, name='gaz_layer')
inputs.append(gaz_layer.link(gaz_values))

# Prepare final input
if len(inputs) != 1:
inputs = T.concatenate(inputs, axis=1)
Expand Down Expand Up @@ -335,6 +348,12 @@ def build(self,
params.extend(cap_layer.params)
self.add_component(final_layer)
params.extend(final_layer.params)
# Gazetteers features
if gaz_path:
self.add_component(gaz_layer)
#experiment.components['gaz_embeddings'] = gaz_embeddings
params.extend(gaz_layer.params)

if crf:
self.add_component(transitions)
params.append(transitions)
Expand All @@ -353,6 +372,8 @@ def build(self,
eval_inputs.append(char_pos_ids)
if cap_dim:
eval_inputs.append(cap_ids)
if gaz_path:
eval_inputs.append(gaz_values)
train_inputs = eval_inputs + [tag_ids]

# Parse optimization method parameters
Expand Down
24 changes: 22 additions & 2 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@
"-r", "--reload", default="0",
type='int', help="Reload the last saved model"
)
# Path to the gazetteers file; gazetteer features stay disabled when empty.
optparser.add_option(
    "-G", "--gaz_path", default="",
    help="Location of the gazetteers file"
)
# Parsed as an integer like the other numeric options. The value is replaced
# by the number of gazetteer categories once a gazetteers file is loaded.
optparser.add_option(
    "-g", "--gaz_dim", default="5",
    type='int', help="Gazetteers dimension"
)
opts = optparser.parse_args()[0]

# Parse parameters
Expand All @@ -111,7 +119,8 @@
parameters['crf'] = opts.crf == 1
parameters['dropout'] = opts.dropout
parameters['lr_method'] = opts.lr_method

parameters['gaz_dim'] = opts.gaz_dim
parameters['gaz_path'] = opts.gaz_path
# Check parameters validity
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
Expand All @@ -122,7 +131,7 @@
assert not parameters['all_emb'] or parameters['pre_emb']
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

assert not parameters['gaz_path'] or os.path.isfile(parameters['gaz_path'])
# Check evaluation script / folders
if not os.path.isfile(eval_script):
raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
Expand Down Expand Up @@ -183,6 +192,17 @@
print "%i / %i / %i sentences in train / dev / test." % (
len(train_data), len(dev_data), len(test_data))

# Gazetteer features are enabled by supplying a gazetteers file. The guard
# tests gaz_path rather than gaz_dim: gaz_dim has a non-empty default, so
# testing it would try to open an empty path when no file was given.
if parameters['gaz_path']:
    # 1. Read the gazetteers file (one "gazetteer<tab>category" per line).
    # 2. For every word of every sentence, build an indicator vector with
    #    one position per category (1 where the word is listed under that
    #    category) and attach it to the sentence as its gazetteer feature.
    gazetteers_dataset, gaz_tags = loader.load_gazetteers(parameters['gaz_path'])
    # The feature dimension is the number of distinct categories.
    parameters['gaz_dim'] = len(gaz_tags)
    loader.add_gazetteers(train_data, gazetteers_dataset, id_to_word, gaz_tags)
    loader.add_gazetteers(dev_data, gazetteers_dataset, id_to_word, gaz_tags)
    loader.add_gazetteers(test_data, gazetteers_dataset, id_to_word, gaz_tags)
# Save the mappings to disk
print 'Saving the mappings to disk...'
model.save_mappings(id_to_word, id_to_char, id_to_tag)
Expand Down
4 changes: 4 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ def create_input(data, parameters, add_label, singletons=None):
chars = data['chars']
if singletons is not None:
words = insert_singletons(words, singletons)
if parameters['gaz_path']:
gaz = data['gazetteers']
if parameters['cap_dim']:
caps = data['caps']
char_for, char_rev, char_pos = pad_word_chars(chars)
Expand All @@ -213,6 +215,8 @@ def create_input(data, parameters, add_label, singletons=None):
input.append(caps)
if add_label:
input.append(data['tags'])
if parameters['gaz_path']:
input.append(gaz)
return input


Expand Down