This repository was archived by the owner on Nov 8, 2022. It is now read-only.

Commit 38aa83a

Improving ID-CNN/LSTM sequence tagger accuracy and distillation models (#131)
* add eval logging and allow cased embeddings
* support bilou format
* updated transformer token tagging to save best and support bilou
* refactor pseudo/distillation procedures
Parent: 95a82da

16 files changed: +1348 −332 lines


nlp_architect/data/sequential_tagging.py

Lines changed: 33 additions & 7 deletions
@@ -290,16 +290,25 @@ def _gen_data(self):
 class TokenClsInputExample(InputExample):
     """A single training/test example for simple sequence token classification."""
 
-    def __init__(self, guid: str, text: str, tokens: List[str], label: List[str] = None):
+    def __init__(
+        self,
+        guid: str,
+        text: str,
+        tokens: List[str],
+        shapes: List[int] = None,
+        label: List[str] = None,
+    ):
         """Constructs a SequenceClassInputExample.
         Args:
             guid: Unique id for the example.
             text: string. The untokenized text of the sequence.
             tokens (List[str]): The list of tokens.
+            shapes (List[int]): The list of token shape ids.
             label (List[str], optional): The tags of the tokens.
         """
         super(TokenClsInputExample, self).__init__(guid, text, label)
         self.tokens = tokens
+        self.shapes = shapes
 
 
 class TokenClsProcessor(DataProcessor):
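
The widened constructor threads an optional, token-aligned shapes list through each example. A minimal construction sketch (the tokens, shape ids, and labels below are invented for illustration):

    from nlp_architect.data.sequential_tagging import TokenClsInputExample

    tokens = ["John", "visited", "Paris"]
    example = TokenClsInputExample(
        guid="train-0",
        text=" ".join(tokens),
        tokens=tokens,
        shapes=[2, 4, 2],  # leading capital / lowercase / leading capital
        label=["B-PER", "O", "B-LOC"],
    )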
@@ -309,12 +318,13 @@ class TokenClsProcessor(DataProcessor):
     Label dictionary is given in labels.txt file.
     """
 
-    def __init__(self, data_dir, tag_col: int = -1):
+    def __init__(self, data_dir, tag_col: int = -1, ignore_token=None):
         if not os.path.exists(data_dir):
             raise FileNotFoundError
         self.data_dir = data_dir
         self.tag_col = tag_col
         self.labels = None
+        self.ignore_token = ignore_token
 
     def _read_examples(self, data_dir, file_name, set_name):
         if not os.path.exists(data_dir + os.sep + file_name):
@@ -325,7 +335,11 @@ def _read_examples(self, data_dir, file_name, set_name):
             )
             return None
         return self._create_examples(
-            read_column_tagged_file(os.path.join(data_dir, file_name), tag_col=self.tag_col),
+            read_column_tagged_file(
+                os.path.join(data_dir, file_name),
+                tag_col=self.tag_col,
+                ignore_token=self.ignore_token,
+            ),
             set_name,
         )
 
@@ -359,19 +373,31 @@ def get_labels_filename():
         return "labels.txt"
 
     @staticmethod
-    def _create_examples(lines, set_type):
+    def _get_shape(string):
+        if all(c.isupper() for c in string):
+            return 1  # "AA"
+        if string[0].isupper():
+            return 2  # "Aa"
+        if any(c.isupper() for c in string):
+            return 3  # "aAa"
+        return 4  # "a"
+
+    @classmethod
+    def _create_examples(cls, lines, set_type):
         """See base class."""
         examples = []
         for i, (sentence, labels) in enumerate(lines):
             guid = "%s-%s" % (set_type, i)
             text = " ".join(sentence)
+            shapes = [cls._get_shape(w) for w in sentence]
             examples.append(
-                TokenClsInputExample(guid=guid, text=text, tokens=sentence, label=labels)
+                TokenClsInputExample(
+                    guid=guid, text=text, tokens=sentence, label=labels, shapes=shapes
+                )
             )
         return examples
 
-    def get_vocabulary(self):
-        examples = self.get_train_examples() + self.get_dev_examples() + self.get_test_examples()
+    def get_vocabulary(self, examples: List[TokenClsInputExample] = None):
         vocab = Vocabulary(start=1)
         for e in examples:
             for t in e.tokens:
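
The new _get_shape helper buckets every token into one of four casing classes, giving the tagger an explicit capitalization feature alongside the (now optionally cased) embeddings. A standalone sketch of the same four-class scheme and the ids it produces:

    def get_shape(token: str) -> int:
        """Four-class token shape, mirroring TokenClsProcessor._get_shape."""
        if all(c.isupper() for c in token):
            return 1  # all caps, e.g. "NASA"
        if token[0].isupper():
            return 2  # leading capital, e.g. "Paris"
        if any(c.isupper() for c in token):
            return 3  # inner capital, e.g. "iPhone"
        return 4  # lowercase, e.g. "city"

    print([get_shape(t) for t in "NASA sent Curiosity to mars".split()])
    # -> [1, 4, 2, 4, 4]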

nlp_architect/data/utils.py

Lines changed: 16 additions & 9 deletions
@@ -94,9 +94,13 @@ def read_tsv(input_file, quotechar=None):
     return lines
 
 
-def read_column_tagged_file(filename: str, tag_col: int = -1):
+def read_column_tagged_file(filename: str, tag_col: int = -1, ignore_token: str = None):
     """Reads a column-tagged (CoNLL-style) file (tab separated, one token per line).
     tag_col is the column number to use as the tag of the token (defaults to the last column)
+    Args:
+        filename (str): input file path
+        tag_col (int): the column containing the labels
+        ignore_token (str): a token to exclude
     return format:
         [ ['token', 'TAG'], ['token', 'TAG2'], ... ]
     """
@@ -113,8 +117,10 @@ def read_column_tagged_file(filename: str, tag_col: int = -1):
                 labels = []
                 continue
             splits = line.split()
-            sentence.append(splits[0])
-            labels.append(splits[tag_col])
+            token = splits[0]
+            if token != ignore_token:
+                sentence.append(token)
+                labels.append(splits[tag_col])
 
     if len(sentence) > 0:
         data.append((sentence, labels))
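
With ignore_token set, a designated placeholder row is skipped together with its tag while the file is read. A usage sketch (the file name, its contents, and the -PAD- placeholder are hypothetical):

    from nlp_architect.data.utils import read_column_tagged_file

    # Suppose train.txt holds one token<TAB>tag pair per line:
    #     John     B-PER
    #     -PAD-    O
    #     visited  O
    #     Paris    B-LOC
    data = read_column_tagged_file("train.txt", tag_col=-1, ignore_token="-PAD-")
    # The -PAD- row is dropped along with its tag:
    # [(['John', 'visited', 'Paris'], ['B-PER', 'O', 'B-LOC'])]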
@@ -166,22 +172,23 @@ def split_column_dataset(
     """
     Splits a single column-tagged dataset into two files according to the number of examples
     requested to be included in each file.
-    split1_count (int) : the amount of examples to include in the first split file
-    split2_count (int) : the amount of examples to include in the second split file
+    first_count (int) : the number of examples to include in the first split file
+    second_count (int) : the number of examples to include in the second split file
     out_folder (str) : the folder in which the result files will be stored
     dataset (str) : the path to the original data file
-    split1_filename (str) : the name of the first split file
-    split2_filename (str) : the name of the second split file
+    first_filename (str) : the name of the first split file
+    second_filename (str) : the name of the second split file
     tag_col (int) : the index of the tag column
     """
     lines = read_column_tagged_file(dataset, tag_col=tag_col)
     num_of_examples = len(lines)
-    assert first_count + second_count <= num_of_examples and first_count > 0 and second_count > 0
+    assert first_count + second_count <= num_of_examples and first_count > 0
     selected_lines = random.sample(lines, first_count + second_count)
     first_data = selected_lines[:first_count]
     second_data = selected_lines[first_count:]
     write_column_tagged_file(out_folder + os.sep + first_filename, first_data)
-    write_column_tagged_file(out_folder + os.sep + second_filename, second_data)
+    if second_count != 0:
+        write_column_tagged_file(out_folder + os.sep + second_filename, second_data)
 
 
 def get_cached_filepath(data_dir, model_name, seq_length, task_name, set_type="train"):
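
The relaxed assertion lets callers request a second split of size zero, in which case only the first file is written, presumably in service of the pseudo-labeling/distillation refactor named in the commit message. A usage sketch (file paths are hypothetical, and the positional argument order is assumed from the docstring above):

    from nlp_architect.data.utils import split_column_dataset

    # Carve 1000 random sentences into a single split file; with
    # second_count=0 the second file is simply not written.
    split_column_dataset(
        1000,            # first_count
        0,               # second_count
        "splits",        # out_folder
        "train.txt",     # dataset
        "labeled.txt",   # first_filename
        "rest.txt",      # second_filename
        tag_col=-1,
    )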
