Commit 419252c

fix(HybridChunker): remove max_length from tokenization (#178)
Signed-off-by: Panos Vagenas <[email protected]>
Parent: 5b09c28 · Commit: 419252c
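Note: Hugging Face tokenizers do not truncate in tokenize() unless truncation is explicitly enabled, so passing max_length=None added nothing to these token counts, and slow (non-fast) tokenizer implementations log a warning for unrecognized keyword arguments. A minimal sketch of the equivalence, assuming a transformers fast tokenizer (the model ID is an illustrative choice, not taken from this commit):

from transformers import AutoTokenizer

# Illustrative model; any Hugging Face tokenizer behaves the same way here.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

text = "Docling splits documents into token-bounded chunks."

# tokenize() applies no truncation by default, so the bare call and the
# call with the redundant keyword yield the same token count:
assert len(tokenizer.tokenize(text)) == len(tokenizer.tokenize(text, max_length=None))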

2 files changed (+4, -4)

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 2 additions & 2 deletions
@@ -73,7 +73,7 @@ def _count_text_tokens(self, text: Optional[Union[str, list[str]]]):
             for t in text:
                 total += self._count_text_tokens(t)
             return total
-        return len(self._tokenizer.tokenize(text, max_length=None))
+        return len(self._tokenizer.tokenize(text))

     class _ChunkLengthInfo(BaseModel):
         total_len: int
@@ -82,7 +82,7 @@ class _ChunkLengthInfo(BaseModel):

     def _count_chunk_tokens(self, doc_chunk: DocChunk):
         ser_txt = self.serialize(chunk=doc_chunk)
-        return len(self._tokenizer.tokenize(text=ser_txt, max_length=None))
+        return len(self._tokenizer.tokenize(text=ser_txt))

     def _doc_chunk_length(self, doc_chunk: DocChunk):
         text_length = self._count_text_tokens(doc_chunk.text)
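Note: _count_text_tokens accepts None, a single string, or a list of strings and recurses over lists, while _count_chunk_tokens serializes the chunk first and counts tokens on the serialized text. A standalone sketch of the same counting pattern, with a whitespace split standing in for the real tokenizer:

from typing import Optional, Union

def count_text_tokens(text: Optional[Union[str, list[str]]]) -> int:
    # Mirrors the recursion in the diff above: None counts as zero,
    # lists are summed element-wise, plain strings are tokenized directly.
    if text is None:
        return 0
    if isinstance(text, list):
        return sum(count_text_tokens(t) for t in text)
    return len(text.split())  # stand-in for self._tokenizer.tokenize(text)

assert count_text_tokens(None) == 0
assert count_text_tokens("one two three") == 3
assert count_text_tokens(["one two", "three"]) == 3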

test/test_hybrid_chunker.py

Lines changed: 2 additions & 2 deletions
@@ -98,7 +98,7 @@ def test_serialize():
             dict(
                 text=chunk.text,
                 ser_text=(ser_text := chunker.serialize(chunk)),
-                num_tokens=len(TOKENIZER.tokenize(ser_text, max_length=None)),
+                num_tokens=len(TOKENIZER.tokenize(ser_text)),
             )
             for chunk in chunks
         ]
@@ -171,7 +171,7 @@ def test_serialize_altered_delim():
             dict(
                 text=chunk.text,
                 ser_text=(ser_text := chunker.serialize(chunk)),
-                num_tokens=len(TOKENIZER.tokenize(ser_text, max_length=None)),
+                num_tokens=len(TOKENIZER.tokenize(ser_text)),
             )
             for chunk in chunks
         ]
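Note: both tests bind the serialized text with an assignment expression ((ser_text := ...), the walrus operator) so it can be stored and reused for the token count within the same dict literal. A minimal standalone illustration of that pattern:

data = [
    dict(
        upper=(u := s.upper()),  # bind the computed value...
        length=len(u),           # ...and reuse it in the same literal
    )
    for s in ["a", "bb"]
]
assert data == [{"upper": "A", "length": 1}, {"upper": "BB", "length": 2}]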
