Skip to content

Commit 75e422a

Browse files
committed
Automatically add the pad token ID to the disabled IDs for safety
1 parent 93bab29 commit 75e422a

File tree

2 files changed

+6
-1
lines changed

2 files changed

+6
-1
lines changed

src/lib.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,16 @@ impl CodebookConfig {
3535
pad_token_id: usize,
3636
disabled_ids: Option<HashSet<usize>>,
3737
) -> Self {
38+
// insert the pad token id in the disabled ids
39+
let mut disabled_ids = disabled_ids.unwrap_or_else(|| HashSet::new());
40+
disabled_ids.insert(pad_token_id);
41+
3842
Self {
3943
initial_vocab_size,
4044
max_codebook_size,
4145
max_subtokens,
4246
pad_token_id,
43-
disabled_ids: disabled_ids.unwrap_or_default(),
47+
disabled_ids,
4448
}
4549
}
4650
}

tests/test.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use zip2zip_compression::{LZWCompressor, CodebookConfig, PaddingStrategy, Codebo
44
fn get_alphabet_codebook_config() -> CodebookConfig {
55
let mut disabled_ids: HashSet<usize> = HashSet::new();
66
disabled_ids.insert(26); // 'z'
7+
disabled_ids.insert(0); // '\0' padding token
78

89
// 26 letters + 1 for the pad token
910
CodebookConfig::new(27, 100, 5, 0, Some(disabled_ids))

0 commit comments

Comments
 (0)