
Commit 50ecdde

mzuevxeldercrow authored and committed
IMDB annotation for XLNet accuracy test fixes. (openvinotoolkit#831)
* fix path for Sentence Piece model loading (PosixPath -> str)
* fix method name (PieceTold -> PieceToId)
* fix convert_single_example() to be able to work without sample.text_b
1 parent ea9f415 commit 50ecdde

2 files changed: +21 -8 lines changed

tools/accuracy_checker/accuracy_checker/annotation_converters/_nlp_common.py

Lines changed: 2 additions & 2 deletions
@@ -257,7 +257,7 @@ def __init__(self, tokenizer_model, lower_case=True, remove_space=True):
         if spm is None:
             raise ConfigError('Sentence piece tokenizer required sentencepiece, please install it before usage')
         self.encoder = spm.SentencePieceProcessor()
-        self.encoder.Load(tokenizer_model)
+        self.encoder.Load(str(tokenizer_model))
         self.lower_case = lower_case
         self.remove_space = remove_space

@@ -275,7 +275,7 @@ def preprocess_text(self, inputs):

     def encode_ids(self, text, sample=False):
         pieces = self.encode_pieces(text, sample)
-        ids = [self.encoder.PieceTold(piece) for piece in pieces]
+        ids = [self.encoder.PieceToId(piece) for piece in pieces]
         return ids

     def encode_pieces(self, text, sample=False):
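For context, a minimal sketch (not part of the commit) of how the two fixed calls behave, assuming the standard sentencepiece Python bindings and a hypothetical spiece.model path; the converter's ConfigError handling is omitted here.

# Hypothetical standalone example; path and sample text are placeholders.
from pathlib import Path

import sentencepiece as spm

tokenizer_model = Path('xlnet_cased_L-12/spiece.model')  # hypothetical path

sp = spm.SentencePieceProcessor()
# Some sentencepiece builds expect a plain str here and fail on a PosixPath,
# hence the str(...) conversion added in the diff above.
sp.Load(str(tokenizer_model))

pieces = sp.EncodeAsPieces('this movie was surprisingly good')
# PieceToId is the actual method name; the old PieceTold call raised
# AttributeError at runtime.
ids = [sp.PieceToId(piece) for piece in pieces]
print(pieces, ids)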

tools/accuracy_checker/accuracy_checker/annotation_converters/text_classification.py

Lines changed: 19 additions & 6 deletions
@@ -73,8 +73,19 @@ def convert_single_example(self, example):
             'segment_ids_{}'.format(example.guid)
         ]
         tokens_a = self.tokenizer.tokenize(example.text_a)
-        tokens_b = self.tokenizer.tokenize(example.text_b)
-        truncate_seq_pair(tokens_a, tokens_b, self.max_seq_length - 3)
+        tokens_b = None
+        if example.text_b:
+            tokens_b = self.tokenizer.tokenize(example.text_b if example.text_b is not None else '')
+
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for two [SEP] & one [CLS] with "- 3"
+            truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for one [SEP] & one [CLS] with "- 2"
+            if len(tokens_a) > self.max_seq_length - 2:
+                tokens_a = tokens_a[:self.max_seq_length - 2]

         tokens = []
         segment_ids = []

@@ -83,11 +94,13 @@ def convert_single_example(self, example):
             segment_ids.append(SEG_ID_A)
         tokens.append('[SEP]' if self.support_vocab else SEP_ID)
         segment_ids.append(SEG_ID_A)
-        for token in tokens_b:
-            tokens.append(token)
+
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                segment_ids.append(SEG_ID_B)
+            tokens.append('[SEP]' if self.support_vocab else SEP_ID)
             segment_ids.append(SEG_ID_B)
-        tokens.append('[SEP]' if self.support_vocab else SEP_ID)
-        segment_ids.append(SEG_ID_B)

         tokens.append("[CLS]" if self.support_vocab else CLS_ID)
         segment_ids.append(SEG_ID_CLS)
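The new branch distinguishes sentence pairs from single sentences: with a pair, truncate_seq_pair is asked to shrink both token lists in place so they fit in max_seq_length - 3 (two [SEP] plus one [CLS]); for single-sentence IMDB examples, tokens_a is simply sliced to max_seq_length - 2 (one [SEP] plus one [CLS]). The helper itself is not part of this diff; the sketch below is an assumption modelled on the BERT/XLNet reference preprocessing and only illustrates the expected in-place behaviour.

# Hypothetical sketch of the pair-truncation helper the diff relies on.
def truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Pops tokens off the longer list, in place, until the pair fits."""
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

# Single-sentence case (text_b is empty), mirroring the new else-branch:
tokens_a = ['▁great', '▁plot', ',', '▁weak', '▁ending']
max_seq_length = 6
if len(tokens_a) > max_seq_length - 2:
    tokens_a = tokens_a[:max_seq_length - 2]
print(tokens_a)  # ['▁great', '▁plot', ',', '▁weak']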
