Commit 44f9979

Patch tokenizers 0.13.3 to build with newer rust
1 parent 1e0be28

1 file changed: +26 -0 lines changed

graalpython/lib-graalpython/patches/tokenizers/tokenizers-0.13.3.patch

@@ -22,3 +22,29 @@ index 6282c31..47e6b12 100644
 
 [features]
 default = ["pyo3/extension-module"]
+diff --git a/tokenizers-lib/src/models/bpe/trainer.rs b/tokenizers-lib/src/models/bpe/trainer.rs
+index 43ab848..55f95f8 100644
+--- a/tokenizers-lib/src/models/bpe/trainer.rs
++++ b/tokenizers-lib/src/models/bpe/trainer.rs
+@@ -518,15 +518,16 @@ impl BpeTrainer {
+             let changes = top
+                 .pos
+                 .maybe_par_iter()
+-                .flat_map(|i| {
+-                    let w = &words[*i] as *const _ as *mut _;
++                .flat_map(|&i| {
++                    let word = &words[i] as *const _ as *mut Word;
+                     // We can merge each of these words in parallel here because each position
+                     // can be there only once (HashSet). So this is safe.
+                     unsafe {
+-                        let word: &mut Word = &mut (*w);
+-                        word.merge(top.pair.0, top.pair.1, new_token_id)
++                        // let word: &mut Word = &mut (*word);
++                        (*word)
++                            .merge(top.pair.0, top.pair.1, new_token_id)
+                             .into_iter()
+-                            .map(|c| (c, *i))
++                            .map(|c| (c, i))
+                             .collect::<Vec<_>>()
+                     }
+                 })
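
Background for the change (my reading, not stated in the commit): upstream tokenizers 0.13.3 builds a `&mut Word` out of a shared reference via a `*const _ as *mut _` cast. Newer rustc rejects that pattern under the `invalid_reference_casting` lint (deny-by-default since Rust 1.73), so the patch keeps the raw pointer and calls `merge` through `(*word)` instead of rebinding it as `&mut Word`. Below is a minimal, self-contained sketch of the patched shape; `Word` and `merge` here are toy stand-ins for illustration, not the tokenizers API.

// Toy stand-in for tokenizers' Word (hypothetical, illustration only).
struct Word(Vec<u32>);

impl Word {
    // Toy merge: rewrite each adjacent (a, b) pair to `new_id` and
    // report the offsets that changed.
    fn merge(&mut self, a: u32, b: u32, new_id: u32) -> Vec<usize> {
        let mut changed = Vec::new();
        let mut i = 0;
        while i + 1 < self.0.len() {
            if self.0[i] == a && self.0[i + 1] == b {
                self.0[i] = new_id;
                self.0.remove(i + 1);
                changed.push(i);
            }
            i += 1;
        }
        changed
    }
}

fn main() {
    let words = vec![Word(vec![1, 2, 3]), Word(vec![3, 1, 2])];
    // Each index appears at most once; that uniqueness is what the
    // upstream comment relies on to justify the unsafe mutation.
    let positions = [0usize, 1];

    let changes: Vec<(usize, usize)> = positions
        .iter()
        .flat_map(|&i| {
            // Keep the raw pointer; do NOT rebind it as `&mut Word`.
            // `let word: &mut Word = &mut (*w);` is the shape newer rustc
            // rejects under `invalid_reference_casting`.
            let word = &words[i] as *const _ as *mut Word;
            unsafe {
                (*word)
                    .merge(1, 2, 9)
                    .into_iter()
                    .map(|c| (c, i))
                    .collect::<Vec<_>>()
            }
        })
        .collect();

    println!("{:?}", changes); // [(0, 0), (1, 1)]
}

Note that this only placates the lint: mutating through a pointer derived from a shared reference remains questionable under Rust's aliasing rules, and both the patch and this sketch lean on the upstream "each position occurs only once" argument for soundness. For getting 0.13.3 to build on a current toolchain, it is enough.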
