Skip to content

Commit f2ea5cf

Browse files
committed
Loading text tutorial: fixed OOV handling
1 parent e4ab8e0 commit f2ea5cf

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

site/en/tutorials/load_data/text.ipynb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,8 +1140,9 @@
11401140
},
11411141
"outputs": [],
11421142
"source": [
1143-
"keys = vocab\n",
1144-
"values = range(2, len(vocab) + 2) # Reserve `0` for padding, `1` for OOV tokens.\n",
1143+
"# Reserve `0` for padding, `1` for OOV tokens.\n",
1144+
"keys = ['', '[UNK]'] + vocab\n",
1145+
"values = range(len(keys))\n",
11451146
"\n",
11461147
"init = tf.lookup.KeyValueTensorInitializer(\n",
11471148
" keys, values, key_dtype=tf.string, value_dtype=tf.int64)\n",
@@ -1171,6 +1172,8 @@
11711172
" standardized = tf_text.case_fold_utf8(text)\n",
11721173
" tokenized = tokenizer.tokenize(standardized)\n",
11731174
" vectorized = vocab_table.lookup(tokenized)\n",
1175+
"    # StaticVocabularyTable maps out-of-vocabulary tokens to the ID `len(keys)`, i.e. `len(vocab) + 2` (assuming a single OOV bucket). Remap that ID to 1, the index reserved for `[UNK]`.\n",
1176+
" vectorized = tf.where(vectorized == len(keys), tf.constant(1, dtype=tf.int64), vectorized)\n",
11741177
" return vectorized, label"
11751178
]
11761179
},

0 commit comments

Comments
 (0)