Skip to content

Commit 3ff4eb2

Browse files
authored
Thin wasm: do not automatically set is_dedup to true for first chunk (#481)
Related to huggingface/huggingface.js#1718. We'll want to edit parts of a file while loading the old data's dedup info. In those cases we don't always want to load dedup info for the first chunk (since it may not be at the beginning of the file). So the is_dedup = true for the first chunk is handled client side.
1 parent cc247a9 commit 3ff4eb2

File tree

1 file changed

+4
-6
lines changed

1 file changed

+4
-6
lines changed

hf_xet_thin_wasm/src/lib.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@ pub struct JsChunkOut {
2828

2929

3030
impl JsChunkOut {
31-
fn new_with_dedup(chunk: deduplication::Chunk, is_first_chunk: bool) -> Self {
31+
fn new_with_dedup(chunk: deduplication::Chunk) -> Self {
3232
let hash_eligible = mdb_shard::constants::hash_is_global_dedup_eligible(&chunk.hash);
3333
JsChunkOut {
3434
hash: chunk.hash.hex(),
3535
length: chunk.data.len() as u32,
36-
dedup: is_first_chunk || hash_eligible,
36+
dedup: hash_eligible,
3737
}
3838
}
3939
}
@@ -61,8 +61,7 @@ impl JsChunker {
6161
let mut serializable_result: Vec<JsChunkOut> = Vec::with_capacity(result.len());
6262

6363
for chunk in result {
64-
let is_first = !self.first_chunk_outputted;
65-
serializable_result.push(JsChunkOut::new_with_dedup(chunk, is_first));
64+
serializable_result.push(JsChunkOut::new_with_dedup(chunk));
6665
self.first_chunk_outputted = true;
6766
}
6867

@@ -72,8 +71,7 @@ impl JsChunker {
7271
pub fn finish(&mut self) -> Result<JsValue, JsValue> {
7372
let mut result: Vec<JsChunkOut> = vec![];
7473
if let Some(final_chunk) = self.inner.finish() {
75-
let is_first = !self.first_chunk_outputted;
76-
result.push(JsChunkOut::new_with_dedup(final_chunk, is_first));
74+
result.push(JsChunkOut::new_with_dedup(final_chunk));
7775
self.first_chunk_outputted = true;
7876
};
7977

0 commit comments

Comments
 (0)