From 1630f8574c3fbb8fb2ab12a5a74176886d8fa5ce Mon Sep 17 00:00:00 2001 From: Akihiro Katsura Date: Tue, 1 Mar 2022 13:13:09 +0900 Subject: [PATCH 1/4] Fix stride on make_spans --- JaQuAD.ipynb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/JaQuAD.ipynb b/JaQuAD.ipynb index 10cfbc9..8d5bb90 100644 --- a/JaQuAD.ipynb +++ b/JaQuAD.ipynb @@ -195,7 +195,8 @@ " val += [padding] * pad_len\n", " return val\n", "\n", - " for i in range(0, input_len - max_seq_len + stride, stride):\n", + " step = max_seq_len - question_len - stride\n", + " for i in range(0, max(context_len - stride, step), step):\n", " span = {key: make_value(val, i) for key, val in inputs.items()}\n", " answer_start = answer_start_position - i\n", " answer_end = answer_end_position - i\n", @@ -686,7 +687,7 @@ " 1:-1].tolist()\n", " end_indexes = np.argsort(end_logits)[-1:-n_best_size - 1:-1].tolist()\n", " cur_offsets = offsets[i:]\n", - " i += doc_stride\n", + " i += max_seq_len - question_len - doc_stride\n", " for start_index in start_indexes:\n", " for end_index in end_indexes:\n", " if 0 < start_index <= end_index < len(cur_offsets):\n", From 48ccd5a0719137130e28a93bbbdbfa3a28260b70 Mon Sep 17 00:00:00 2001 From: Akihiro Katsura Date: Tue, 1 Mar 2022 13:21:28 +0900 Subject: [PATCH 2/4] Fix answer indices --- JaQuAD.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/JaQuAD.ipynb b/JaQuAD.ipynb index 8d5bb90..89551b1 100644 --- a/JaQuAD.ipynb +++ b/JaQuAD.ipynb @@ -483,11 +483,12 @@ "\n", " ctx_start = tokens.index(self.tokenizer.sep_token_id) + 1\n", " answer_start_index = ctx_start\n", - " answer_end_index = len(offsets) - 1\n", - " while offsets[answer_start_index][0] < start_char:\n", + " while offsets[answer_start_index][1] < start_char:\n", " answer_start_index += 1\n", - " while offsets[answer_end_index][1] > start_char + len(answer):\n", - " answer_end_index -= 1\n", + " answer_end_index = answer_start_index\n", + " while answer_end_index < len(offsets) \\\n", + " and offsets[answer_end_index][0] < start_char + len(answer):\n", + " answer_end_index += 1\n", "\n", " span_inputs = {\n", " 'input_ids': tokens,\n", From 25fc2860598e1d5aa02b05a2b5f753296f8841f8 Mon Sep 17 00:00:00 2001 From: Akihiro Katsura Date: Tue, 1 Mar 2022 23:23:29 +0900 Subject: [PATCH 3/4] Fix a comment about doc_stride --- JaQuAD.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/JaQuAD.ipynb b/JaQuAD.ipynb index 89551b1..4df868b 100644 --- a/JaQuAD.ipynb +++ b/JaQuAD.ipynb @@ -125,7 +125,7 @@ " 'batch_size': 32, # <=32 for TPUv2-8\n", " 'lr': 2e-5, # Learning Rate\n", " 'max_length': 384, # Max Length input size\n", - " 'doc_stride': 128, # The interval of the context when splitting is needed\n", + " 'doc_stride': 128, # The overlap of the context when splitting is needed\n", " 'epochs': 4, # Max Epochs\n", " 'dataset': 'SkelterLabsInc/JaQuAD',\n", " 'huggingface_auth_token': None,\n", From 3a829c432664cdb8ca33b61077cc14a14c7af613 Mon Sep 17 00:00:00 2001 From: Akihiro Katsura Date: Wed, 2 Mar 2022 21:58:34 +0900 Subject: [PATCH 4/4] Fix get_answers typing --- JaQuAD.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/JaQuAD.ipynb b/JaQuAD.ipynb index 4df868b..fde85ce 100644 --- a/JaQuAD.ipynb +++ b/JaQuAD.ipynb @@ -662,7 +662,7 @@ }, "outputs": [], "source": [ - "def get_answers(model: AutoModelForQuestionAnswering,\n", + "def get_answers(model: QAModel,\n", " context: str,\n", " question: str,\n", " n_best_size: int = 5,\n",