30 | 30 | "metadata": {}, |
31 | 31 | "source": [ |
32 | 32 | "This example leverages the\n", |
33 | | - "[LangChain Docling integration](../../integrations/langchain/), along with\n", |
34 | | - "Milvus-based document store and retriever instances, as well as sentence-transformers\n", |
35 | | - "embeddings.\n", |
| 33 | + "[LangChain Docling integration](../../integrations/langchain/), along with a Milvus\n", |
| 34 | + "vector store, as well as sentence-transformers embeddings.\n", |
36 | 35 | "\n", |
37 | 36 | "The presented `DoclingLoader` component enables you to:\n", |
38 | 37 | "- use various document types in your LLM applications with ease and speed, and\n", |
|
44 | 43 | "- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and\n", |
45 | 44 | " to then capture each individual chunk as a separate LangChain document downstream.\n", |
46 | 45 | "\n", |
47 | | - "The example allows to explore both modes via parameter `EXPORT_TYPE`; depending on the\n", |
48 | | - "value set, the ingestion and RAG pipelines are then set up accordingly." |
| 46 | + "The example allows exploring both modes via parameter `EXPORT_TYPE`; depending on the\n", |
| 47 | + "value set, the example pipeline is then set up accordingly." |
49 | 48 | ] |
50 | 49 | }, |
51 | 50 | { |
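Downstream, the "set up accordingly" dispatch might look like the sketch below (an assumption, not shown in this diff: it presumes the notebook's `EXPORT_TYPE` variable and the `docs` list produced by the loader cell further down, and the header names passed to the splitter are illustrative). This is also where the `splits` passed to Milvus later would come from:

```python
# Sketch: dispatching on the loader's export mode (assumes EXPORT_TYPE and
# docs as defined elsewhere in the notebook; header names are illustrative).
from langchain_docling.loader import ExportType
from langchain_text_splitters import MarkdownHeaderTextSplitter

if EXPORT_TYPE == ExportType.DOC_CHUNKS:
    splits = docs  # already one LangChain document per Docling chunk
elif EXPORT_TYPE == ExportType.MARKDOWN:
    # one markdown document per input file; split it on headings instead
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header_1"), ("##", "Header_2")],
    )
    splits = [s for doc in docs for s in splitter.split_text(doc.page_content)]
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")
```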
|
78 | 77 | } |
79 | 78 | ], |
80 | 79 | "source": [ |
81 | | - "# %pip install -q --progress-bar off --no-warn-conflicts docling-langchain langchain-text-splitters\n", |
82 | | - "%pip install -q --progress-bar off --no-warn-conflicts langchain-text-splitters" |
| 80 | + "%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain python-dotenv" |
83 | 81 | ] |
84 | 82 | }, |
85 | 83 | { |
|
104 | 102 | "from dotenv import load_dotenv\n", |
105 | 103 | "from langchain_core.prompts import PromptTemplate\n", |
106 | 104 | "\n", |
107 | | - "from docling_langchain.loader import ExportType\n", |
| 105 | + "from langchain_docling.loader import ExportType\n", |
108 | 106 | "\n", |
109 | 107 | "\n", |
110 | 108 | "def _get_env_from_colab_or_os(key):\n", |
|
161 | 159 | } |
162 | 160 | ], |
163 | 161 | "source": [ |
164 | | - "from docling_langchain import DoclingLoader\n", |
| 162 | + "from docling.chunking import HybridChunker\n", |
| 163 | + "\n", |
| 164 | + "from langchain_docling import DoclingLoader\n", |
165 | 165 | "\n", |
166 | 166 | "loader = DoclingLoader(\n", |
167 | 167 | " file_path=FILE_PATH,\n", |
168 | 168 | " export_type=EXPORT_TYPE,\n", |
| 169 | + " chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n", |
169 | 170 | ")\n", |
170 | 171 | "\n", |
171 | 172 | "docs = loader.load()" |
|
257 | 258 | "vectorstore = Milvus.from_documents(\n", |
258 | 259 | " documents=splits,\n", |
259 | 260 | " embedding=embedding,\n", |
| 261 | + " collection_name=\"docling_demo\",\n", |
260 | 262 | " connection_args={\"uri\": milvus_uri},\n", |
261 | 263 | " index_params={\"index_type\": \"FLAT\"},\n", |
262 | 264 | " drop_old=True,\n", |
|
274 | 276 | "cell_type": "code", |
275 | 277 | "execution_count": 7, |
276 | 278 | "metadata": {}, |
| 279 | + "outputs": [], |
| 280 | + "source": [ |
| 281 | + "from langchain.chains import create_retrieval_chain\n", |
| 282 | + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", |
| 283 | + "from langchain_huggingface import HuggingFaceEndpoint\n", |
| 284 | + "\n", |
| 285 | + "retriever = vectorstore.as_retriever(search_kwargs={\"k\": TOP_K})\n", |
| 286 | + "llm = HuggingFaceEndpoint(\n", |
| 287 | + " repo_id=GEN_MODEL_ID,\n", |
| 288 | + " huggingfacehub_api_token=HF_TOKEN,\n", |
| 289 | + ")\n", |
| 290 | + "\n", |
| 291 | + "\n", |
| 292 | + "def clip_text(text, threshold=100):\n", |
| 293 | + " return f\"{text[:threshold]}...\" if len(text) > threshold else text" |
| 294 | + ] |
| 295 | + }, |
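This new cell wires up the retriever and LLM; `PROMPT`, used by the chain below, is defined earlier in the notebook. For reference, `create_stuff_documents_chain` requires a prompt with a `context` variable and `create_retrieval_chain` supplies `input`, so a prompt of roughly this shape (wording assumed) satisfies both:

```python
# Assumed PROMPT shape: "context" is required by create_stuff_documents_chain;
# "input" is what create_retrieval_chain passes from .invoke({"input": ...}).
from langchain_core.prompts import PromptTemplate

PROMPT = PromptTemplate.from_template(
    "Context information is below.\n"
    "---------------------\n"
    "{context}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {input}\n"
    "Answer:\n"
)
```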
| 296 | + { |
| 297 | + "cell_type": "code", |
| 298 | + "execution_count": 15, |
| 299 | + "metadata": {}, |
277 | 300 | "outputs": [ |
278 | 301 | { |
279 | 302 | "name": "stdout", |
|
283 | 306 | "Which are the main AI models in Docling?\n", |
284 | 307 | "\n", |
285 | 308 | "Answer:\n", |
286 | | - "\"The main AI models in Docling are:\\n1. A layout analysis model, an accurate object-detector for page elements.\\n2. TableFormer, a state-of-the-art table structure recognition model.\"\n", |
| 309 | + "Docling currently supports two main AI models, namely a layout analysis model and a table structure recognition model. The first model is a layout analysis model, an accurate object-detector for page ...\n", |
287 | 310 | "\n", |
288 | 311 | "Source 1:\n", |
289 | | - " text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a lay...\"\n", |
| 312 | + " text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure re...\"\n", |
290 | 313 | " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/50', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 3, 'bbox': {'l': 108.0, 't': 405.1419982910156, 'r': 504.00299072265625, 'b': 330.7799987792969, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 608]}]}], 'headings': ['3.2 AI models'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n", |
291 | 314 | " source: https://arxiv.org/pdf/2408.09869\n", |
292 | 315 | "\n", |
293 | 316 | "Source 2:\n", |
294 | | - " text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieve...\"\n", |
| 317 | + " text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support ...\"\n", |
295 | 318 | " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/26', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 108.0, 't': 273.01800537109375, 'r': 504.00299072265625, 'b': 176.83799743652344, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 796]}]}], 'headings': ['3 Processing pipeline'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n", |
296 | 319 | " source: https://arxiv.org/pdf/2408.09869\n", |
297 | 320 | "\n", |
298 | 321 | "Source 3:\n", |
299 | | - " text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-clas...\"\n", |
| 322 | + " text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of ...\"\n", |
300 | 323 | " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/76', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 322.468994140625, 'r': 504.00299072265625, 'b': 259.0169982910156, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 543]}]}, {'self_ref': '#/texts/77', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 251.6540069580078, 'r': 504.00299072265625, 'b': 198.99200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 402]}]}], 'headings': ['6 Future work and contributions'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n", |
301 | | - " source: https://arxiv.org/pdf/2408.09869\n", |
302 | | - "\n", |
303 | | - "Source 4:\n", |
304 | | - " text: \"3.3 Assembly\\nIn the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliar...\"\n", |
305 | | - " dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/62', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 4, 'bbox': {'l': 108.0, 't': 506.08099365234375, 'r': 504.00299072265625, 'b': 431.718994140625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 622]}]}], 'headings': ['3.3 Assembly'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n", |
306 | 324 | " source: https://arxiv.org/pdf/2408.09869\n" |
307 | 325 | ] |
308 | 326 | } |
309 | 327 | ], |
310 | 328 | "source": [ |
311 | | - "from langchain.chains import create_retrieval_chain\n", |
312 | | - "from langchain.chains.combine_documents import create_stuff_documents_chain\n", |
313 | | - "from langchain_huggingface import HuggingFaceEndpoint\n", |
314 | | - "\n", |
315 | | - "llm = HuggingFaceEndpoint(repo_id=GEN_MODEL_ID)\n", |
316 | | - "\n", |
317 | | - "\n", |
318 | | - "def clip_text(text, threshold=100):\n", |
319 | | - " return f\"{text[:threshold]}...\" if len(text) > threshold else text\n", |
320 | | - "\n", |
321 | | - "\n", |
322 | | - "retriever = vectorstore.as_retriever()\n", |
323 | 329 | "question_answer_chain = create_stuff_documents_chain(llm, PROMPT)\n", |
324 | 330 | "rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n", |
325 | 331 | "resp_dict = rag_chain.invoke({\"input\": QUESTION})\n", |
326 | 332 | "\n", |
327 | | - "answer = clip_text(resp_dict[\"answer\"], threshold=200)\n", |
328 | | - "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(answer)}\")\n", |
| 333 | + "clipped_answer = clip_text(resp_dict[\"answer\"], threshold=200)\n", |
| 334 | + "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n", |
329 | 335 | "for i, doc in enumerate(resp_dict[\"context\"]):\n", |
330 | 336 | " print()\n", |
331 | 337 | " print(f\"Source {i+1}:\")\n", |
332 | | - " print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=200))}\")\n", |
| 338 | + " print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n", |
333 | 339 | " for key in doc.metadata:\n", |
334 | 340 | " if key != \"pk\":\n", |
335 | 341 | " val = doc.metadata.get(key)\n", |
|
361 | 367 | "name": "python", |
362 | 368 | "nbconvert_exporter": "python", |
363 | 369 | "pygments_lexer": "ipython3", |
364 | | - "version": "3.12.7" |
| 370 | + "version": "3.12.8" |
365 | 371 | } |
366 | 372 | }, |
367 | 373 | "nbformat": 4, |
|