|
66 | 66 | "cell_type": "code", |
67 | 67 | "execution_count": 2, |
68 | 68 | "metadata": {}, |
69 | | - "outputs": [ |
70 | | - { |
71 | | - "name": "stdout", |
72 | | - "output_type": "stream", |
73 | | - "text": [ |
74 | | - "chunker.max_tokens=512\n" |
75 | | - ] |
76 | | - } |
77 | | - ], |
| 69 | + "outputs": [], |
78 | 70 | "source": [ |
79 | 71 | "from transformers import AutoTokenizer\n", |
80 | 72 | "\n", |
81 | 73 | "EMBED_MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n", |
82 | | - "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)\n", |
83 | | - "chunker = HybridChunker(tokenizer=tokenizer)\n", |
84 | | - "print(f\"{chunker.max_tokens=}\")" |
| 74 | + "tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_ID)" |
85 | 75 | ] |
86 | 76 | }, |
87 | 77 | { |
|
202 | 192 | } |
203 | 193 | ], |
204 | 194 | "source": [ |
| 195 | + "chunker = HybridChunker(tokenizer=tokenizer)\n", |
| 196 | + "\n", |
205 | 197 | "chunk_iter = chunker.chunk(dl_doc=doc)\n", |
206 | 198 | "\n", |
207 | 199 | "chunks = list(chunk_iter)\n", |
|
279 | 271 | } |
280 | 272 | ], |
281 | 273 | "source": [ |
282 | | - "doc_serializer = ChunkingDocSerializer(\n", |
283 | | - " doc=doc,\n", |
284 | | - " table_serializer=MarkdownTableSerializer(), # configuring a different table serializer\n", |
| 274 | + "from docling_core.transforms.chunker.hierarchical_chunker import ChunkingSerializerProvider\n", |
| 275 | + "\n", |
| 276 | + "\n", |
| 277 | + "class MDTableSerializerProvider(ChunkingSerializerProvider):\n", |
| 278 | + " def get_serializer(self, doc):\n", |
| 279 | + " return ChunkingDocSerializer(\n", |
| 280 | + " doc=doc,\n", |
| 281 | + " table_serializer=MarkdownTableSerializer(), # configuring a different table serializer\n", |
| 282 | + " )\n", |
| 283 | + "\n", |
| 284 | + "chunker = HybridChunker(\n", |
| 285 | + " tokenizer=tokenizer,\n", |
| 286 | + " serializer_provider=MDTableSerializerProvider(),\n", |
285 | 287 | ")\n", |
286 | 288 | "\n", |
287 | | - "chunk_iter = chunker.chunk(dl_doc=doc, doc_serializer=doc_serializer)\n", |
| 289 | + "chunk_iter = chunker.chunk(dl_doc=doc)\n", |
288 | 290 | "\n", |
289 | 291 | "chunks = list(chunk_iter)\n", |
290 | 292 | "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.TABLE)\n", |
|
355 | 357 | "source": [ |
356 | 358 | "from docling_core.experimental.serializer.markdown import MarkdownParams\n", |
357 | 359 | "\n", |
358 | | - "doc_serializer = ChunkingDocSerializer(\n", |
359 | | - " doc=doc,\n", |
360 | | - " params=MarkdownParams(\n", |
361 | | - " image_placeholder=\"<!-- image -->\",\n", |
362 | | - " ),\n", |
| 360 | + "class ImgPlaceholderSerializerProvider(ChunkingSerializerProvider):\n", |
| 361 | + " def get_serializer(self, doc):\n", |
| 362 | + " return ChunkingDocSerializer(\n", |
| 363 | + " doc=doc,\n", |
| 364 | + " params=MarkdownParams(\n", |
| 365 | + " image_placeholder=\"<!-- image -->\",\n", |
| 366 | + " ),\n", |
| 367 | + " )\n", |
| 368 | + "\n", |
| 369 | + "chunker = HybridChunker(\n", |
| 370 | + " tokenizer=tokenizer,\n", |
| 371 | + " serializer_provider=ImgPlaceholderSerializerProvider(),\n", |
363 | 372 | ")\n", |
364 | 373 | "\n", |
365 | | - "chunk_iter = chunker.chunk(dl_doc=doc, doc_serializer=doc_serializer)\n", |
| 374 | + "chunk_iter = chunker.chunk(dl_doc=doc)\n", |
366 | 375 | "\n", |
367 | 376 | "chunks = list(chunk_iter)\n", |
368 | 377 | "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n", |
|
466 | 475 | } |
467 | 476 | ], |
468 | 477 | "source": [ |
469 | | - "doc_serializer = ChunkingDocSerializer(\n", |
470 | | - " doc=doc,\n", |
471 | | - " picture_serializer=AnnotationPictureSerializer(), # configuring a different picture serializer\n", |
| 478 | + "class ImgAnnotationSerializerProvider(ChunkingSerializerProvider):\n", |
| 479 | + " def get_serializer(self, doc):\n", |
| 480 | + " return ChunkingDocSerializer(\n", |
| 481 | + " doc=doc,\n", |
| 482 | + " picture_serializer=AnnotationPictureSerializer(), # configuring a different picture serializer\n", |
| 483 | + " )\n", |
| 484 | + "\n", |
| 485 | + "chunker = HybridChunker(\n", |
| 486 | + " tokenizer=tokenizer,\n", |
| 487 | + " serializer_provider=ImgAnnotationSerializerProvider(),\n", |
472 | 488 | ")\n", |
473 | 489 | "\n", |
474 | | - "chunk_iter = chunker.chunk(dl_doc=doc, doc_serializer=doc_serializer)\n", |
| 490 | + "chunk_iter = chunker.chunk(dl_doc=doc)\n", |
475 | 491 | "\n", |
476 | 492 | "chunks = list(chunk_iter)\n", |
477 | 493 | "i, chunk = find_n_th_chunk_with_label(chunks, n=0, label=DocItemLabel.PICTURE)\n", |
|
0 commit comments