Skip to content

Commit 705b94f

Browse files
author
Ander Corral
committed
Fixed some issues
1 parent f5b1eef commit 705b94f

File tree

8 files changed

+64
-12
lines changed

8 files changed

+64
-12
lines changed

docs/source/FAQ.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,7 @@ A C C C C A A B
498498
**Notes**
499499
- Prior tokenization is not necessary, features will be inferred by using the `FeatInferTransform` transform.
500500
- `FilterFeatsTransform` and `FeatInferTransform` are required in order to ensure the functionality.
501+
- Not possible to do shared embeddings (at least with `feat_merge: concat` method)
501502
502503
Sample config file:
503504
@@ -529,10 +530,20 @@ feat_merge: "sum"
529530

530531
```
531532
532-
During inference you can pass features by using the `--src_feats` argument.
533+
During inference you can pass features by using the `--src_feats` argument. `src_feats` is expected to be a Python-like dict, mapping each feature name to its data file.
534+
535+
```
536+
{'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}
537+
```
533538
534539
**Important note!** During inference, input sentence is expected to be tokenized. Therefore feature inferring should be handled prior to running the translate command. Example:
535540
536541
```bash
537542
python translate.py -model model_step_10.pt -src ../data.txt.tok -output ../data.out --src_feats "{'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}"
538543
```
544+
545+
When using the Transformer architecture make sure the following options are appropriately set:
546+
547+
- `src_word_vec_size` and `tgt_word_vec_size` or `word_vec_size`
548+
- `feat_merge`: how to handle feature vectors
549+
- `feat_vec_size` and maybe `feat_vec_exponent`

onmt/inputters/corpus.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def _process(item, is_train):
7575
maybe_example['src'] = {"src": ' '.join(maybe_example['src'])}
7676

7777
# Make features part of src as in MultiTextField
78+
# {'src': {'src': ..., 'feat1': ...., 'feat2': ....}}
7879
if 'src_feats' in maybe_example:
7980
for feat_name, feat_value in maybe_example['src_feats'].items():
8081
maybe_example['src'][feat_name] = ' '.join(feat_value)
@@ -328,12 +329,12 @@ def build_sub_vocab(corpora, transforms, opts, n_sample, stride, offset):
328329
if opts.dump_samples:
329330
build_sub_vocab.queues[c_name][offset].put("blank")
330331
continue
331-
src_line, tgt_line = maybe_example['src'], maybe_example['tgt']
332+
src_line, tgt_line = maybe_example['src']['src'], maybe_example['tgt']['tgt']
332333
for feat_name, feat_line in maybe_example["src"].items():
333334
if feat_name != "src":
334335
sub_counter_src_feats[feat_name].update(feat_line.split(' '))
335-
sub_counter_src.update(src_line["src"].split(' '))
336-
sub_counter_tgt.update(tgt_line["tgt"].split(' '))
336+
sub_counter_src.update(src_line.split(' '))
337+
sub_counter_tgt.update(tgt_line.split(' '))
337338
if opts.dump_samples:
338339
build_sub_vocab.queues[c_name][offset].put(
339340
(i, src_line, tgt_line))

onmt/inputters/text_dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ def read(self, sequences, side, features={}):
1717
path to text file or iterable of the actual text data.
1818
side (str): Prefix used in return dict. Usually
1919
``"src"`` or ``"tgt"``.
20+
features: (Dict[str, str or Iterable[str]]):
21+
dictionary mapping feature names with th path to feature
22+
file or iterable of the actual feature data.
2023
2124
Yields:
2225
dictionaries whose keys are the names of fields and whose
@@ -53,6 +56,7 @@ def text_sort_key(ex):
5356
return len(ex.src[0])
5457

5558

59+
# Legacy function. Currently it only truncates input if truncate is set.
5660
# mix this with partial
5761
def _feature_tokenize(
5862
string, layer=0, tok_delim=None, feat_delim=None, truncate=None):

onmt/opts.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -761,8 +761,8 @@ def translate_opts(parser):
761761
help="Source sequence to decode (one line per "
762762
"sequence)")
763763
group.add("-src_feats", "--src_feats", required=False,
764-
help="Source sequence features (one line per "
765-
"sequence)")
764+
help="Source sequence features (dict format). "
765+
"Ex: {'feat_0': '../data.txt.feats0', 'feat_1': '../data.txt.feats1'}")
766766
group.add('--tgt', '-tgt',
767767
help='True target sequence (optional)')
768768
group.add('--tgt_prefix', '-tgt_prefix', action='store_true',

onmt/tests/test_subword_marker.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,18 @@ def test_subword_group_joiner(self):
4141
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
4242
self.assertEqual(out, true_out)
4343

44-
def test_subword_group_joiner_with_markup(self):
44+
def test_subword_group_joiner_with_case_markup(self):
4545
data_in = ['⦅mrk_case_modifier_C⦆', 'however', '■,', 'according', 'to', 'the', 'logs', '■,', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■-■', 'working', '■.', '⦅mrk_end_case_region_U⦆'] # noqa: E501
4646
true_out = [0, 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7]
4747
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
4848
self.assertEqual(out, true_out)
4949

50+
def test_subword_group_joiner_with_new_joiner(self):
51+
data_in = ['⦅mrk_case_modifier_C⦆', 'however', '■', ',', 'according', 'to', 'the', 'logs', '■', ',', '⦅mrk_begin_case_region_U⦆', 'she', 'is', 'hard', '■', '-', '■', 'working', '■', '.', '⦅mrk_end_case_region_U⦆'] # noqa: E501
52+
true_out = [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7]
53+
out = subword_map_by_joiner(data_in, marker=SubwordMarker.JOINER, case_markup=SubwordMarker.CASE_MARKUP)
54+
self.assertEqual(out, true_out)
55+
5056
def test_subword_group_naive(self):
5157
data_in = ['however', ',', 'according', 'to', 'the', 'logs', ',', 'she', 'is', 'hard', '-', 'working', '.'] # noqa: E501
5258
true_out = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
@@ -63,6 +69,18 @@ def test_subword_group_spacer(self):
6369
no_dummy_out = subword_map_by_spacer(no_dummy)
6470
self.assertEqual(no_dummy_out, true_out)
6571

72+
def test_subword_group_spacer_with_case_markup(self):
73+
data_in = ['⦅mrk_case_modifier_C⦆', '▁however', ',', '▁according', '▁to', '▁the', '▁logs', ',', '▁⦅mrk_begin_case_region_U⦆', '▁she', '▁is', '▁hard', '-', 'working', '.', '▁⦅mrk_end_case_region_U⦆'] # noqa: E501
74+
true_out = [0, 0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 7, 7, 7, 7, 7]
75+
out = subword_map_by_spacer(data_in)
76+
self.assertEqual(out, true_out)
77+
78+
def test_subword_group_spacer_with_spacer_new(self):
79+
data_in = ['⦅mrk_case_modifier_C⦆', '▁', 'however', ',', '▁', 'according', '▁', 'to', '▁', 'the', '▁', 'logs', ',', '▁', '⦅mrk_begin_case_region_U⦆', '▁', 'she', '▁', 'is', '▁', 'hard', '-', 'working', '.', '▁', '⦅mrk_end_case_region_U⦆'] # noqa: E501
80+
true_out = [0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7]
81+
out = subword_map_by_spacer(data_in)
82+
self.assertEqual(out, true_out)
83+
6684

6785
if __name__ == '__main__':
6886
unittest.main()

onmt/transforms/features.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,9 @@ def apply(self, example, is_train=False, stats=None, **kwargs):
6363
# Do nothing
6464
return example
6565

66-
# TODO: support joiner_new or spacer_new options. Consistency not ensured currently
67-
6866
if self.reversible_tokenization == "joiner":
6967
word_to_subword_mapping = subword_map_by_joiner(example["src"])
7068
else: #Spacer
71-
# TODO: case markup
7269
word_to_subword_mapping = subword_map_by_spacer(example["src"])
7370

7471
inferred_feats = defaultdict(list)

onmt/translate/translator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ def translate(
346346
Args:
347347
src: See :func:`self.src_reader.read()`.
348348
tgt: See :func:`self.tgt_reader.read()`.
349+
src_feats: See :func:`self.src_reader.read()`.
349350
batch_size (int): size of examples per mini-batch
350351
attn_debug (bool): enables the attention logging
351352
align_debug (bool): enables the word alignment logging

onmt/utils/alignment.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,29 @@ def subword_map_by_joiner(subwords, marker=SubwordMarker.JOINER, case_markup=Sub
134134
return word_group
135135

136136

137-
def subword_map_by_spacer(subwords, marker=SubwordMarker.SPACER):
137+
def subword_map_by_spacer(subwords, marker=SubwordMarker.SPACER, case_markup=SubwordMarker.CASE_MARKUP):
138138
"""Return word id for each subword token (annotate by spacer)."""
139-
word_group = list(accumulate([int(marker in x) for x in subwords]))
139+
flags = [0] * len(subwords)
140+
for i, tok in enumerate(subwords):
141+
if marker in tok:
142+
if tok.replace(marker, "") in case_markup:
143+
if i < len(subwords)-1:
144+
flags[i] = 1
145+
else:
146+
if i > 0:
147+
previous = subwords[i-1].replace(marker, "")
148+
if previous not in case_markup:
149+
flags[i] = 1
150+
151+
# In case there is a final case_markup when new_spacer is on
152+
for i in range(1,len(subwords)-1):
153+
if subwords[-i] in case_markup:
154+
flags[-i] = 0
155+
elif subwords[-i] == marker:
156+
flags[-i] = 0
157+
break
158+
159+
word_group = list(accumulate(flags))
140160
if word_group[0] == 1: # when dummy prefix is set
141161
word_group = [item - 1 for item in word_group]
142162
return word_group

0 commit comments

Comments
 (0)