From af2e776252a01fee386ac3b73261142a7b12e12c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joel=20Hsu=28=E5=BE=90=E6=8D=B7=E8=80=80=29?=
Date: Fri, 10 Oct 2025 11:25:17 +0800
Subject: [PATCH 1/2] docs: fix tokenizer input by wrapping dataset column
 with list()

---
 chapters/zh-TW/chapter3/2.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chapters/zh-TW/chapter3/2.mdx b/chapters/zh-TW/chapter3/2.mdx
index 4cc8f883d..f4c6eaed2 100644
--- a/chapters/zh-TW/chapter3/2.mdx
+++ b/chapters/zh-TW/chapter3/2.mdx
@@ -164,8 +164,8 @@ from transformers import AutoTokenizer
 
 checkpoint = "bert-base-uncased"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
 ```
 
 然而,在兩句話傳遞給模型,預測這兩句話是否是同義之前。我們需要這兩句話依次進行適當的預處理。幸運的是,標記器不僅僅可以輸入單個句子還可以輸入一組句子,並按照我們的BERT模型所期望的輸入進行處理:

From feb3c7893d1c44a018bc40c1eb02075a20f20132 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joel=20Hsu=28=E5=BE=90=E6=8D=B7=E8=80=80=29?=
Date: Fri, 10 Oct 2025 11:25:17 +0800
Subject: [PATCH 2/2] docs: fix tokenizer input by wrapping dataset column
 with list()

---
 chapters/zh-TW/chapter3/2.mdx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/chapters/zh-TW/chapter3/2.mdx b/chapters/zh-TW/chapter3/2.mdx
index 4cc8f883d..49039a4ea 100644
--- a/chapters/zh-TW/chapter3/2.mdx
+++ b/chapters/zh-TW/chapter3/2.mdx
@@ -164,8 +164,8 @@ from transformers import AutoTokenizer
 
 checkpoint = "bert-base-uncased"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
 ```
 
 然而,在兩句話傳遞給模型,預測這兩句話是否是同義之前。我們需要這兩句話依次進行適當的預處理。幸運的是,標記器不僅僅可以輸入單個句子還可以輸入一組句子,並按照我們的BERT模型所期望的輸入進行處理:
@@ -221,8 +221,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])
 
 ```py
 tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    raw_datasets["train"]["sentence1"].to_pylist(),
+    raw_datasets["train"]["sentence2"].to_pylist(),
     padding=True,
     truncation=True,
 )
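
For reviewers, a runnable sketch of the corrected calls, assuming the GLUE MRPC dataset and `bert-base-uncased` checkpoint that the surrounding chapter uses; the premise that indexing a split column now returns a column view rather than a plain `list` is inferred from the patch itself:

```py
# Minimal sketch, assuming the GLUE MRPC setup from the patched chapter.
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# On newer `datasets` releases, indexing a split column can yield a lazy
# column view rather than a plain list (assumption inferred from the patch);
# list() materializes it into the list[str] input the tokenizer expects.
sentences_1 = list(raw_datasets["train"]["sentence1"])
sentences_2 = list(raw_datasets["train"]["sentence2"])

# Passing both lists tokenizes the sentence pairs together, with the same
# padding/truncation options as the second hunk.
tokenized_dataset = tokenizer(sentences_1, sentences_2, padding=True, truncation=True)
print(tokenized_dataset.keys())  # input_ids, token_type_ids, attention_mask
```

Note that the two commits spell the coercion differently (`list(...)` in the first hunk, `.to_pylist()` in the second); both materialize the column into plain Python lists.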