Skip to content

Commit 12f9e97

Browse files
committed
(wip) xet chunk code generated by cursor to fix
1 parent 3306f2c commit 12f9e97

File tree

1 file changed

+172
-0
lines changed

1 file changed

+172
-0
lines changed
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
import { nextMatch } from "./next-match";
2+
3+
// Chunking parameters (AssemblyScript: `usize` is the native pointer-sized unsigned integer).

/** Default target chunk size in bytes (64 KiB). Used when XetChunker is constructed without an argument. */
const TARGET_CHUNK_SIZE: usize = 64 * 1024;
/** The minimum chunk size is the target chunk size divided by this value. */
const MINIMUM_CHUNK_DIVISOR: usize = 8;
/** The maximum chunk size is the target chunk size multiplied by this value. */
const MAXIMUM_CHUNK_MULTIPLIER: usize = 2;
/** Byte count subtracted from the minimum chunk size when skipping ahead before boundary detection. */
const HASH_WINDOW_SIZE: usize = 64;
8+
9+
/**
 * A single chunk: its bytes together with the digest computed over them.
 */
export class Chunk {
  /** Digest of `data`, as supplied by the creator of the chunk. */
  hash: Uint8Array;
  /** The chunk's bytes. Stored by reference; no copy is made here. */
  data: Uint8Array;

  /**
   * @param hash Digest associated with `data`.
   * @param data Bytes that make up the chunk.
   */
  constructor(hash: Uint8Array, data: Uint8Array) {
    this.hash = hash;
    this.data = data;
  }
}
18+
19+
/**
 * Return value of `XetChunker.next()`: the chunk that was completed by the
 * call (if any) and how many bytes of the supplied input were consumed.
 */
export class NextResult {
  /** The completed chunk, or `null` when the call did not finish one. */
  chunk: Chunk | null;
  /** Number of input bytes consumed by the call. */
  bytesConsumed: usize;

  /**
   * @param chunk Completed chunk, or `null`.
   * @param bytesConsumed Number of input bytes consumed.
   */
  constructor(chunk: Chunk | null, bytesConsumed: usize) {
    this.chunk = chunk;
    this.bytesConsumed = bytesConsumed;
  }
}
29+
30+
/**
 * Streaming content-defined chunker.
 *
 * Input is fed in through `next()` / `nextBlock()`. Boundary detection is
 * delegated to `nextMatch`, which is given the current window of data, the
 * boundary mask and the running hash state. Chunks are additionally forced to
 * end once they reach the maximum chunk size, and the first part of every
 * chunk is skipped without boundary detection so that chunks do not end
 * before the minimum chunk size.
 */
export class XetChunker {
  private minimumChunk: usize;
  private maximumChunk: usize;
  private mask: u64;
  // Scratch buffer that accumulates the bytes of the chunk being built.
  // It is reused for every chunk, so emitted chunks must copy out of it.
  private chunkBuf: Uint8Array;
  private curChunkLen: usize;
  private hash: u64;

  /**
   * @param targetChunkSize Target chunk size in bytes. Must be a power of two,
   *   greater than HASH_WINDOW_SIZE and less than `u32.MAX_VALUE`.
   */
  constructor(targetChunkSize: usize = TARGET_CHUNK_SIZE) {
    // Validate target chunk size is a power of 2
    assert((targetChunkSize & (targetChunkSize - 1)) == 0, "Target chunk size must be a power of 2");
    assert(targetChunkSize > HASH_WINDOW_SIZE, "Target chunk size must be greater than hash window size");
    assert(targetChunkSize < u32.MAX_VALUE, "Target chunk size must be less than u32.MAX_VALUE");

    let mask = (targetChunkSize - 1) as u64;
    // Shift the mask left by its leading-zero count so that its set bits
    // occupy the most significant bits of the 64-bit word.
    mask = mask << clz(mask);

    this.minimumChunk = targetChunkSize / MINIMUM_CHUNK_DIVISOR;
    this.maximumChunk = targetChunkSize * MAXIMUM_CHUNK_MULTIPLIER;
    this.mask = mask;
    this.chunkBuf = new Uint8Array(this.maximumChunk);
    this.curChunkLen = 0;
    this.hash = 0;
  }

  /**
   * Consumes bytes from the start of `data` until a chunk boundary is found,
   * the maximum chunk size is reached, or the input is exhausted.
   *
   * @param data Input bytes. May be empty.
   * @param isFinal When true, any buffered bytes are emitted as a chunk even
   *   if no boundary was found.
   * @returns The completed chunk (or `null`) and the number of bytes of
   *   `data` that were consumed. The chunk's `data` is an independent copy.
   */
  next(data: Uint8Array, isFinal: boolean): NextResult {
    const nBytes = data.length;
    let createChunk = false;
    let consumeLen: usize = 0;

    if (nBytes != 0) {
      // Skip ahead without boundary detection while the chunk is still well
      // below the minimum size, stopping HASH_WINDOW_SIZE + 1 bytes short of it.
      if (this.curChunkLen + HASH_WINDOW_SIZE < this.minimumChunk) {
        const maxAdvance = min(this.minimumChunk - this.curChunkLen - HASH_WINDOW_SIZE - 1, nBytes - consumeLen);
        consumeLen += maxAdvance;
        this.curChunkLen += maxAdvance;
      }

      // Never scan past the point where the maximum chunk size forces a cut.
      const readEnd = min(nBytes, consumeLen + this.maximumChunk - this.curChunkLen);

      let bytesToNextBoundary: usize;
      const matchResult = nextMatch(data.subarray(consumeLen, readEnd), this.mask, this.hash);

      if (matchResult.position != -1) {
        bytesToNextBoundary = matchResult.position;
        createChunk = true;
      } else {
        // No boundary in the scanned window: everything scanned joins the chunk.
        bytesToNextBoundary = readEnd - consumeLen;
      }
      // The running hash state is carried over in both cases.
      this.hash = matchResult.hash;

      // Hitting the maximum chunk size always ends the chunk.
      if (bytesToNextBoundary + this.curChunkLen >= this.maximumChunk) {
        bytesToNextBoundary = this.maximumChunk - this.curChunkLen;
        createChunk = true;
      }

      this.curChunkLen += bytesToNextBoundary;
      consumeLen += bytesToNextBoundary;

      // Append the consumed bytes after the bytes already buffered for this chunk.
      this.chunkBuf.set(data.subarray(0, consumeLen), this.curChunkLen - consumeLen);
    }

    if (createChunk || (isFinal && this.curChunkLen > 0)) {
      // Copy (slice), not view (subarray): chunkBuf is reused for the next
      // chunk and would otherwise overwrite the data of chunks already returned.
      const chunkData = this.chunkBuf.slice(0, this.curChunkLen);
      const chunk = new Chunk(computeDataHash(chunkData), chunkData);
      this.curChunkLen = 0;
      this.hash = 0;
      return new NextResult(chunk, consumeLen);
    }

    return new NextResult(null, consumeLen);
  }

  /**
   * Runs `next()` repeatedly until all of `data` has been consumed.
   *
   * @param data Input bytes.
   * @param isFinal Forwarded to every `next()` call.
   * @returns All chunks completed while consuming `data`, in order. If `data`
   *   is empty nothing is consumed or emitted; use `finish()` to flush.
   */
  nextBlock(data: Uint8Array, isFinal: boolean): Chunk[] {
    const chunks: Chunk[] = [];
    let pos: usize = 0;

    while (pos < data.length) {
      const result = this.next(data.subarray(pos), isFinal);
      // Read the nullable field once into a local so it can be null-narrowed.
      const chunk = result.chunk;
      if (chunk) {
        chunks.push(chunk);
      }
      pos += result.bytesConsumed;
    }

    return chunks;
  }

  /**
   * Flushes any buffered bytes as a final chunk.
   *
   * @returns The final chunk, or `null` when nothing is buffered.
   */
  finish(): Chunk | null {
    return this.next(new Uint8Array(0), true).chunk;
  }
}
127+
128+
/**
 * Placeholder 32-byte digest of `data`.
 *
 * NOTE: despite the intent recorded in the TODO below, this is NOT SHA-256.
 * It XOR-folds the input into a 32-byte buffer (byte `i` is XORed into slot
 * `i % 32`), so it is not collision resistant and must not be relied on as a
 * real content hash.
 *
 * @param data Bytes to digest.
 * @returns A new 32-byte array; all zeros for empty input.
 */
function computeDataHash(data: Uint8Array): Uint8Array {
  // TODO: Replace with actual SHA-256 implementation
  // For now, using a simple hash function for demonstration
  const hash = new Uint8Array(32);
  for (let i = 0; i < data.length; i++) {
    hash[i % 32] ^= data[i];
  }
  return hash;
}
138+
139+
/**
 * Returns the smaller of two unsigned sizes.
 *
 * @param a First value.
 * @param b Second value.
 * @returns `a` when `a < b`, otherwise `b`.
 */
function min(a: usize, b: usize): usize {
  return a < b ? a : b;
}
143+
144+
/**
 * Counts the leading zero bits of a 64-bit unsigned integer.
 *
 * Works by binary search: at each step, if the top 32/16/8/4/2/1 bits are all
 * zero, that many zeros are added to the count and the value is shifted left
 * so the next step inspects the following bits.
 *
 * @param x Value to inspect.
 * @returns Number of leading zero bits, in the range 0..64 (64 when `x` is 0).
 */
function clz(x: u64): u32 {
  let n: u32 = 0;
  if (x == 0) return 64;
  if ((x & 0xffffffff00000000) == 0) {
    n += 32;
    x <<= 32;
  }
  if ((x & 0xffff000000000000) == 0) {
    n += 16;
    x <<= 16;
  }
  if ((x & 0xff00000000000000) == 0) {
    n += 8;
    x <<= 8;
  }
  if ((x & 0xf000000000000000) == 0) {
    n += 4;
    x <<= 4;
  }
  if ((x & 0xc000000000000000) == 0) {
    n += 2;
    x <<= 2;
  }
  // Last step only needs the count; x is not used afterwards.
  if ((x & 0x8000000000000000) == 0) {
    n += 1;
  }
  return n;
}

0 commit comments

Comments
 (0)