This repository was archived by the owner on Nov 8, 2022. It is now read-only.

Commit 38aa83a

Improving ID-CNN/LSTM sequence tagger accuracy and distillation models (#131)
* add eval logging and allow cased embeddings
* support bilou format
* updated transformer token tagging to save best and support bilou
* refactor pseudo/distillation procedures
Parent: 95a82da

16 files changed: +1348 −332 lines


nlp_architect/data/sequential_tagging.py

Lines changed: 33 additions & 7 deletions
@@ -290,16 +290,25 @@ def _gen_data(self):
 class TokenClsInputExample(InputExample):
     """A single training/test example for simple sequence token classification."""
 
-    def __init__(self, guid: str, text: str, tokens: List[str], label: List[str] = None):
+    def __init__(
+        self,
+        guid: str,
+        text: str,
+        tokens: List[str],
+        shapes: List[int] = None,
+        label: List[str] = None,
+    ):
         """Constructs a SequenceClassInputExample.
         Args:
             guid: Unique id for the example.
             text: string. The untokenized text of the sequence.
             tokens (List[str]): The list of tokens.
+            shapes (List[int]): The list of token shape ids.
             label (List[str], optional): The tags of the tokens.
         """
         super(TokenClsInputExample, self).__init__(guid, text, label)
         self.tokens = tokens
+        self.shapes = shapes
 
 
 class TokenClsProcessor(DataProcessor):
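
The widened constructor threads an optional, token-aligned shapes list through each example. A minimal construction sketch (the tokens, shape ids, and labels below are invented for illustration):

    from nlp_architect.data.sequential_tagging import TokenClsInputExample

    tokens = ["John", "visited", "Paris"]
    example = TokenClsInputExample(
        guid="train-0",
        text=" ".join(tokens),
        tokens=tokens,
        shapes=[2, 4, 2],  # leading capital / lowercase / leading capital
        label=["B-PER", "O", "B-LOC"],
    )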
@@ -309,12 +318,13 @@ class TokenClsProcessor(DataProcessor):
     Label dictionary is given in labels.txt file.
     """
 
-    def __init__(self, data_dir, tag_col: int = -1):
+    def __init__(self, data_dir, tag_col: int = -1, ignore_token=None):
         if not os.path.exists(data_dir):
             raise FileNotFoundError
         self.data_dir = data_dir
         self.tag_col = tag_col
         self.labels = None
+        self.ignore_token = ignore_token
 
     def _read_examples(self, data_dir, file_name, set_name):
         if not os.path.exists(data_dir + os.sep + file_name):
@@ -325,7 +335,11 @@ def _read_examples(self, data_dir, file_name, set_name):
             )
             return None
         return self._create_examples(
-            read_column_tagged_file(os.path.join(data_dir, file_name), tag_col=self.tag_col),
+            read_column_tagged_file(
+                os.path.join(data_dir, file_name),
+                tag_col=self.tag_col,
+                ignore_token=self.ignore_token,
+            ),
             set_name,
         )
 
@@ -359,19 +373,31 @@ def get_labels_filename():
         return "labels.txt"
 
     @staticmethod
-    def _create_examples(lines, set_type):
+    def _get_shape(string):
+        if all(c.isupper() for c in string):
+            return 1  # "AA"
+        if string[0].isupper():
+            return 2  # "Aa"
+        if any(c.isupper() for c in string):
+            return 3  # "aAa"
+        return 4  # "a"
+
+    @classmethod
+    def _create_examples(cls, lines, set_type):
         """See base class."""
         examples = []
         for i, (sentence, labels) in enumerate(lines):
             guid = "%s-%s" % (set_type, i)
             text = " ".join(sentence)
+            shapes = [cls._get_shape(w) for w in sentence]
             examples.append(
-                TokenClsInputExample(guid=guid, text=text, tokens=sentence, label=labels)
+                TokenClsInputExample(
+                    guid=guid, text=text, tokens=sentence, label=labels, shapes=shapes
+                )
             )
         return examples
 
-    def get_vocabulary(self):
-        examples = self.get_train_examples() + self.get_dev_examples() + self.get_test_examples()
+    def get_vocabulary(self, examples: List[TokenClsInputExample] = None):
         vocab = Vocabulary(start=1)
         for e in examples:
             for t in e.tokens:
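
The new _get_shape helper buckets every token into one of four casing classes, giving the tagger an explicit capitalization feature alongside the (now optionally cased) embeddings. A standalone sketch of the same four-class scheme and the ids it produces:

    def get_shape(token: str) -> int:
        """Four-class token shape, mirroring TokenClsProcessor._get_shape."""
        if all(c.isupper() for c in token):
            return 1  # all caps, e.g. "NASA"
        if token[0].isupper():
            return 2  # leading capital, e.g. "Paris"
        if any(c.isupper() for c in token):
            return 3  # inner capital, e.g. "iPhone"
        return 4  # lowercase, e.g. "city"

    print([get_shape(t) for t in "NASA sent Curiosity to mars".split()])
    # -> [1, 4, 2, 4, 4]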

nlp_architect/data/utils.py

Lines changed: 16 additions & 9 deletions
@@ -94,9 +94,13 @@ def read_tsv(input_file, quotechar=None):
     return lines
 
 
-def read_column_tagged_file(filename: str, tag_col: int = -1):
+def read_column_tagged_file(filename: str, tag_col: int = -1, ignore_token: str = None):
     """Reads a column-tagged (CoNLL-style) file (tab separated, one token per line).
     tag_col is the column number to use as the tag of the token (defaults to the last column)
+    Args:
+        filename (str): input file path
+        tag_col (int): the column containing the labels
+        ignore_token (str): a token to exclude
     return format:
         [ ['token', 'TAG'], ['token', 'TAG2'], ... ]
     """
@@ -113,8 +117,10 @@ def read_column_tagged_file(filename: str, tag_col: int = -1):
                 labels = []
                 continue
             splits = line.split()
-            sentence.append(splits[0])
-            labels.append(splits[tag_col])
+            token = splits[0]
+            if token != ignore_token:
+                sentence.append(token)
+                labels.append(splits[tag_col])
 
     if len(sentence) > 0:
         data.append((sentence, labels))
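
With ignore_token set, a designated placeholder row is skipped together with its tag while the file is read. A usage sketch (the file name, its contents, and the -PAD- placeholder are hypothetical):

    from nlp_architect.data.utils import read_column_tagged_file

    # Suppose train.txt holds one token<TAB>tag pair per line:
    #     John     B-PER
    #     -PAD-    O
    #     visited  O
    #     Paris    B-LOC
    data = read_column_tagged_file("train.txt", tag_col=-1, ignore_token="-PAD-")
    # The -PAD- row is dropped along with its tag:
    # [(['John', 'visited', 'Paris'], ['B-PER', 'O', 'B-LOC'])]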
@@ -166,22 +172,23 @@ def split_column_dataset(
     """
     Splits a single column-tagged dataset into two files according to the number of examples
     requested to be included in each file.
-    split1_count (int) : the amount of examples to include in the first split file
-    split2_count (int) : the amount of examples to include in the second split file
+    first_count (int) : the number of examples to include in the first split file
+    second_count (int) : the number of examples to include in the second split file
     out_folder (str) : the folder in which the result files will be stored
     dataset (str) : the path to the original data file
-    split1_filename (str) : the name of the first split file
-    split2_filename (str) : the name of the second split file
+    first_filename (str) : the name of the first split file
+    second_filename (str) : the name of the second split file
     tag_col (int) : the index of the tag column
     """
     lines = read_column_tagged_file(dataset, tag_col=tag_col)
     num_of_examples = len(lines)
-    assert first_count + second_count <= num_of_examples and first_count > 0 and second_count > 0
+    assert first_count + second_count <= num_of_examples and first_count > 0
     selected_lines = random.sample(lines, first_count + second_count)
     first_data = selected_lines[:first_count]
     second_data = selected_lines[first_count:]
     write_column_tagged_file(out_folder + os.sep + first_filename, first_data)
-    write_column_tagged_file(out_folder + os.sep + second_filename, second_data)
+    if second_count != 0:
+        write_column_tagged_file(out_folder + os.sep + second_filename, second_data)
 
 
 def get_cached_filepath(data_dir, model_name, seq_length, task_name, set_type="train"):
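
The relaxed assertion lets callers request a second split of size zero, in which case only the first file is written, presumably in service of the pseudo-labeling/distillation refactor named in the commit message. A usage sketch (file paths are hypothetical, and the positional argument order is assumed from the docstring above):

    from nlp_architect.data.utils import split_column_dataset

    # Carve 1000 random sentences into a single split file; with
    # second_count=0 the second file is simply not written.
    split_column_dataset(
        1000,            # first_count
        0,               # second_count
        "splits",        # out_folder
        "train.txt",     # dataset
        "labeled.txt",   # first_filename
        "rest.txt",      # second_filename
        tag_col=-1,
    )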
