Skip to content

Commit 87b72d6

Browse files
authored
fix(HybridChunker): refine max_tokens auto-detection (#306)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent f067c51 commit 87b72d6

File tree

5 files changed

+683
-26
lines changed

5 files changed

+683
-26
lines changed

docling_core/transforms/chunker/hybrid_chunker.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ def _patch(cls, data: Any) -> Any:
8888
"For updated usage check out "
8989
"https://docling-project.github.io/docling/examples/hybrid_chunking/",
9090
DeprecationWarning,
91-
stacklevel=3,
9291
)
9392

9493
if isinstance(tokenizer, str):

docling_core/transforms/chunker/tokenizer/huggingface.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
"""HuggingFace tokenization."""
22

3-
import sys
3+
import json
44
from os import PathLike
55
from typing import Optional, Union
66

7-
from pydantic import ConfigDict, PositiveInt, TypeAdapter, model_validator
7+
from huggingface_hub import hf_hub_download
8+
from pydantic import ConfigDict, model_validator
89
from typing_extensions import Self
910

1011
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
@@ -28,16 +29,23 @@ class HuggingFaceTokenizer(BaseTokenizer):
2829

2930
@model_validator(mode="after")
3031
def _patch(self) -> Self:
31-
if hasattr(self.tokenizer, "model_max_length"):
32-
model_max_tokens: PositiveInt = TypeAdapter(PositiveInt).validate_python(
33-
self.tokenizer.model_max_length
34-
)
35-
user_max_tokens = self.max_tokens or sys.maxsize
36-
self.max_tokens = min(model_max_tokens, user_max_tokens)
37-
elif self.max_tokens is None:
38-
raise ValueError(
39-
"max_tokens must be defined as model does not define model_max_length"
40-
)
32+
if self.max_tokens is None:
33+
try:
34+
# try to use SentenceTransformers-specific config as that seems to be
35+
# reliable (whenever available)
36+
config_name = "sentence_bert_config.json"
37+
config_path = hf_hub_download(
38+
repo_id=self.tokenizer.name_or_path,
39+
filename=config_name,
40+
)
41+
with open(config_path) as f:
42+
data = json.load(f)
43+
self.max_tokens = int(data["max_seq_length"])
44+
except Exception as e:
45+
raise RuntimeError(
46+
"max_tokens could not be determined automatically; please set "
47+
"explicitly."
48+
) from e
4149
return self
4250

4351
def count_tokens(self, text: str):

test/data/chunker/2c_out_chunks.json

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
}
4141
},
4242
{
43-
"text": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.\nIBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.\nIBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]\nIBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, - its DOS software provided by Microsoft, - which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s, IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.\nAs one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing Awards.[16]",
43+
"text": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries.\nIt is a publicly traded company and one of the 30 companies in the Dow Jones Industrial Average.\nIBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.\nIBM was founded in 1911 as the Computing-Tabulating-Recording Company (CTR), a holding company of manufacturers of record-keeping and measuring systems. It was renamed \"International Business Machines\" in 1924 and soon became the leading manufacturer of punch-card tabulating systems. During the 1960s and 1970s, the IBM mainframe, exemplified by the System/360, was the world's dominant computing platform, with the company producing 80 percent of computers in the U.S. and 70 percent of computers worldwide.[11]",
4444
"meta": {
4545
"schema_name": "docling_core.transforms.chunker.DocMeta",
4646
"version": "1.0.0",
@@ -144,7 +144,24 @@
144144
]
145145
}
146146
]
147-
},
147+
}
148+
],
149+
"headings": [
150+
"IBM"
151+
],
152+
"origin": {
153+
"mimetype": "application/pdf",
154+
"binary_hash": 15535403176419637685,
155+
"filename": "wiki.pdf"
156+
}
157+
}
158+
},
159+
{
160+
"text": "IBM debuted in the microcomputer market in 1981 with the IBM Personal Computer, - its DOS software provided by Microsoft, - which became the basis for the majority of personal computers to the present day.[12] The company later also found success in the portable space with the ThinkPad. Since the 1990s, IBM has concentrated on computer services, software, supercomputers, and scientific research; it sold its microcomputer division to Lenovo in 2005. IBM continues to develop mainframes, and its supercomputers have consistently ranked among the most powerful in the world in the 21st century.\nAs one of the world's oldest and largest technology companies, IBM has been responsible for several technological innovations, including the automated teller machine (ATM), dynamic random-access memory (DRAM), the floppy disk, the hard disk drive, the magnetic stripe card, the relational database, the SQL programming language, and the UPC barcode. The company has made inroads in advanced computer chips, quantum computing, artificial intelligence, and data infrastructure.[13][14][15] IBM employees and alumni have won various recognitions for their scientific research and inventions, including six Nobel Prizes and six Turing Awards.[16]",
161+
"meta": {
162+
"schema_name": "docling_core.transforms.chunker.DocMeta",
163+
"version": "1.0.0",
164+
"doc_items": [
148165
{
149166
"self_ref": "#/texts/6",
150167
"parent": {
@@ -294,7 +311,51 @@
294311
}
295312
},
296313
{
297-
"text": "Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was oiered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:105 He implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan, \"THINK\", became a mantra for each company's employees.[25] During Watson's first four years, revenues reached $9 million ($158 million today) and the company's operations expanded to Europe, South America, Asia and Australia.[25] Watson never liked the clumsy hyphenated name \"ComputingTabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR's Canadian Division;[27] the name was changed on February 14, 1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.",
314+
"text": "Collectively, the companies manufactured a wide array of machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was oiered a position at CTR.[23] Watson joined CTR as general manager and then, 11 months later, was made President when antitrust cases relating to his time at NCR were resolved.[24] Having learned Patterson's pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[23]:105 He implemented sales conventions, \"generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker\".[25][26] His favorite slogan, \"THINK\", became a mantra for each company's employees.[25] During Watson's first four years, revenues reached $9 million ($158 million today) and the company's operations expanded to Europe, South",
315+
"meta": {
316+
"schema_name": "docling_core.transforms.chunker.DocMeta",
317+
"version": "1.0.0",
318+
"doc_items": [
319+
{
320+
"self_ref": "#/texts/12",
321+
"parent": {
322+
"$ref": "#/body"
323+
},
324+
"children": [],
325+
"content_layer": "body",
326+
"label": "text",
327+
"prov": [
328+
{
329+
"page_no": 2,
330+
"bbox": {
331+
"l": 35.189144134521484,
332+
"t": 561.5880126953125,
333+
"r": 575.9761352539062,
334+
"b": 292.5360107421875,
335+
"coord_origin": "BOTTOMLEFT"
336+
},
337+
"charspan": [
338+
0,
339+
1504
340+
]
341+
}
342+
]
343+
}
344+
],
345+
"headings": [
346+
"IBM",
347+
"History",
348+
"1910s-1950s"
349+
],
350+
"origin": {
351+
"mimetype": "application/pdf",
352+
"binary_hash": 15535403176419637685,
353+
"filename": "wiki.pdf"
354+
}
355+
}
356+
},
357+
{
358+
"text": "America, Asia and Australia.[25] Watson never liked the clumsy hyphenated name \"ComputingTabulating-Recording Company\" and chose to replace it with the more expansive title \"International Business Machines\" which had previously been used as the name of CTR's Canadian Division;[27] the name was changed on February 14, 1924.[28] By 1933, most of the subsidiaries had been merged into one company, IBM.",
298359
"meta": {
299360
"schema_name": "docling_core.transforms.chunker.DocMeta",
300361
"version": "1.0.0",

0 commit comments

Comments
 (0)