Commit 7a5b002

More accurate score (#1940)
* match tokens to get accurate score
* fix bugs
* fix code style
1 parent 73d5318 commit 7a5b002

File tree

20 files changed: +690 -377 lines changed

examples/model_interpretation/evaluation/accuracy/cal_acc.py

Lines changed: 2 additions & 0 deletions
@@ -68,6 +68,8 @@ def cal_acc(golden_label, pred_label):
     """
     acc = 0.0
     for ids in pred_label:
+        if ids not in golden_label:
+            continue
         if pred_label[ids] == golden_label[ids]:
             acc += 1
     if len(golden_label):
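
For reference, a minimal, runnable sketch of the patched metric: the new guard skips predicted ids that have no golden counterpart instead of letting the lookup raise a KeyError. The hunk ends at the if len(golden_label): line, so the final division by the golden-label count below is an assumption about the unchanged remainder of the function.

def cal_acc(golden_label, pred_label):
    """Accuracy over predicted ids that also appear in the golden labels (sketch)."""
    acc = 0.0
    for ids in pred_label:
        if ids not in golden_label:  # new guard: ignore predictions with no golden entry
            continue
        if pred_label[ids] == golden_label[ids]:
            acc += 1
    if len(golden_label):
        return acc / len(golden_label)  # assumed denominator, not shown in the hunk
    return 0.0

# Example: the prediction for 'c' has no golden label and is simply skipped.
golden = {'a': 1, 'b': 0}
pred = {'a': 1, 'b': 1, 'c': 0}
print(cal_acc(golden, pred))  # 0.5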
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
[New file: a punctuation list, 82 lines, one character per line. The recoverable characters are:
, ] ! ( ÷ ˊ . _ @ ~ | ' × · ° > ; " / < + ^ ? [ * : ) = - \ % & ]
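
Later hunks in this commit pass a punctuation path into CharTokenizer (CharTokenizer(vocab, args.language, '../punctuations')). As a rough, hypothetical illustration only (load_punctuations and tokenize below are not the repo's implementation), such a list can be used to peel punctuation off into standalone tokens so that model tokens and golden rationale tokens line up one-to-one when scored:

# Hypothetical sketch: treat every character listed in the punctuation file
# as its own token when splitting whitespace-separated text.
def load_punctuations(path):
    with open(path, encoding='utf8') as f:
        return {line.rstrip('\n') for line in f if line.rstrip('\n')}

def tokenize(text, puncts):
    tokens = []
    for word in text.split():
        buf = ''
        for ch in word:
            if ch in puncts:
                if buf:
                    tokens.append(buf)
                buf = ''
                tokens.append(ch)  # punctuation becomes a standalone token
            else:
                buf += ch
        if buf:
            tokens.append(buf)
    return tokens

print(tokenize('good, not bad!', {',', '!'}))
# ['good', ',', 'not', 'bad', '!']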

examples/model_interpretation/rationale_extraction/generate_evaluation_data.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def r_data_generation(args, evids, text_dict_list, text_exclusive_dict_list,
             'context_idx']
         if len(temp['rationale']) > 1 and \
             args.inter_mode != 'lime' and \
-            not (args.language == 'en' and args.base_model.startswith('roberta')):
+            not (args.base_model.startswith('roberta')):
             for i in range(len(temp['rationale'][1])):
                 temp['rationale'][1][i] -= len(temp['rationale'][0]) + len(temp[
                     'no_rationale'][0])
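
The surviving loop rebases the second rationale segment's token indices by the number of tokens recorded before it. A toy version of that arithmetic (the layout of temp['rationale'] and temp['no_rationale'] is inferred from the code, not documented here):

# Toy example: shift second-segment indices down by the tokens that precede it.
rationale = [[0, 1], [7, 9]]         # indices in the concatenated sequence
no_rationale = [[2, 3, 4, 5, 6]]
offset = len(rationale[0]) + len(no_rationale[0])   # 2 + 5 = 7
rationale[1] = [i - offset for i in rationale[1]]
print(rationale[1])  # [0, 2] -> indices local to the second segment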

examples/model_interpretation/rationale_extraction/run_2_pred_senti_per.sh

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ do
         CKPT=../task/${TASK}/pretrained_models/saved_model_ch/roberta_large_20220318_170123/model_900/model_state.pdparams
         #CKPT=../../../${TASK}/pretrained_models/saved_model_ch/roberta_large_20211207_143351/model_900/model_state.pdparams
     elif [[ $BASE_MODEL == "lstm" ]]; then
-        VOCAB_PATH=../task/${TASK}/rnn
+        VOCAB_PATH=../task/${TASK}/rnn/vocab.txt
         CKPT=../task/${TASK}/rnn/checkpoints_ch/final.pdparams
     fi
 fi

examples/model_interpretation/rationale_extraction/sentiment_pred.py

Lines changed: 11 additions & 31 deletions
@@ -182,20 +182,10 @@ def truncate_offset(seg, start_offset, end_offset):


 def init_lstm_var(args):
-    #different language has different tokenizer
-    if args.language == "ch":
-        tokenizer = ErnieTokenizer.from_pretrained(args.vocab_path)
-        padding_idx = tokenizer.vocab.get('[PAD]')
-        tokenizer.inverse_vocab = [
-            item[0]
-            for item in sorted(
-                tokenizer.vocab.items(), key=lambda x: x[1])
-        ]
-    else:
-        vocab = Vocab.load_vocabulary(
-            args.vocab_path, unk_token='[UNK]', pad_token='[PAD]')
-        tokenizer = CharTokenizer(vocab)
-        padding_idx = vocab.token_to_idx.get('[PAD]', 0)
+    vocab = Vocab.load_vocabulary(
+        args.vocab_path, unk_token='[UNK]', pad_token='[PAD]')
+    tokenizer = CharTokenizer(vocab, args.language, '../punctuations')
+    padding_idx = vocab.token_to_idx.get('[PAD]', 0)

     trans_fn = partial(
         convert_example,

@@ -299,23 +289,13 @@ def init_roberta_var(args):
                 input_ids[0, 1:-1].tolist())  # list

         elif args.base_model == 'lstm':
-            if args.language == 'ch':
-                input_ids, seq_lens = d
-                #input_ids = paddle.to_tensor([input_ids[0][0]])
-                fwd_args = [input_ids, seq_lens]
-                fwd_kwargs = {}
-                tokens = [
-                    tokenizer.inverse_vocab[input_id]
-                    for input_id in input_ids.tolist()[0]
-                ]
-            else:
-                input_ids, seq_lens = d
-                fwd_args = [input_ids, seq_lens]
-                fwd_kwargs = {}
-                tokens = [
-                    tokenizer.vocab.idx_to_token[input_id]
-                    for input_id in input_ids.tolist()[0]
-                ]
+            input_ids, seq_lens = d
+            fwd_args = [input_ids, seq_lens]
+            fwd_kwargs = {}
+            tokens = [
+                tokenizer.vocab.idx_to_token[input_id]
+                for input_id in input_ids.tolist()[0]
+            ]

         result['id'] = dataloader.dataset.data[step]['id']
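
After this change both languages take the same path in the LSTM branch: ids are mapped back to tokens through the vocab's idx_to_token table instead of a per-language inverse_vocab. A self-contained toy version of that lookup (a plain dict stands in for the paddlenlp Vocab object):

# Toy stand-in for the unified id -> token lookup in the LSTM branch.
idx_to_token = {0: '[PAD]', 1: '[UNK]', 2: 'good', 3: 'movie'}
input_ids = [[2, 3, 0, 0]]  # one padded sequence, as input_ids.tolist() would yield
tokens = [idx_to_token[input_id] for input_id in input_ids[0]]
print(tokens)  # ['good', 'movie', '[PAD]', '[PAD]']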

examples/model_interpretation/rationale_extraction/similarity_pred.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def init_lstm_var(args):
         unk_token='[UNK]',
         pad_token='[PAD]')

-    tokenizer = CharTokenizer(vocab, language=args.language)
+    tokenizer = CharTokenizer(vocab, args.language, '../punctuations')
     model = SimNet(network='lstm', vocab_size=len(vocab), num_classes=2)

     dev_ds = SimilarityData().read(os.path.join(args.data_dir, 'dev'))

examples/model_interpretation/task/mrc/saliency_map/rc_finetune.py

Lines changed: 11 additions & 5 deletions
@@ -150,7 +150,17 @@ def map_fn_DuCheckList_finetune(examples):
             # Start/end character index of the answer in the text.
             start_char = answer_starts[0]
             end_char = start_char + len(answers[0])
-            if args.language == 'ch':
+            if args.language == 'en':
+                # Start token index of the current span in the text.
+                token_start_index = 0
+                while not (offsets[token_start_index] ==
+                           (0, 0) and offsets[token_start_index + 1] == (0, 0)):
+                    token_start_index += 1
+                token_start_index += 2
+
+                # End token index of the current span in the text.
+                token_end_index = len(input_ids) - 2
+            else:
                 # Start token index of the current span in the text.
                 token_start_index = 0
                 while sequence_ids[token_start_index] != 1:

@@ -160,10 +170,6 @@ def map_fn_DuCheckList_finetune(examples):
                 token_end_index = len(input_ids) - 2
                 while sequence_ids[token_end_index] != 1:
                     token_end_index -= 1
-            else:
-
-                token_start_index = tokenized_example['context_start_id']
-                token_end_index = tokenized_example['context_end_id']

             # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
             if not (offsets[token_start_index][0] <= start_char and
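
The new English branch locates the start of the context by scanning the offset mapping for the first pair of consecutive (0, 0) entries and skipping past them. The layout assumed below (special tokens mapped to (0, 0), question tokens before context tokens) is an illustration consistent with that loop, not taken from the repo:

# Toy illustration of the English-branch scan over an assumed offsets layout.
offsets = [(0, 0),          # leading special token
           (0, 4), (5, 9),  # question tokens
           (0, 0), (0, 0),  # two special tokens separating question and context
           (0, 3), (4, 8),  # context tokens
           (0, 0)]          # trailing special token
token_start_index = 0
while not (offsets[token_start_index] == (0, 0)
           and offsets[token_start_index + 1] == (0, 0)):
    token_start_index += 1
token_start_index += 2       # skip the two (0, 0) separators
print(token_start_index)     # 5 -> index of the first context token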
