Skip to content

Commit bc98187

Browse files
authored
feat: rename distribution pkg & import pkg (#2)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent ca0a964 commit bc98187

File tree

8 files changed

+54
-48
lines changed

8 files changed

+54
-48
lines changed

.pre-commit-config.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,31 @@ repos:
44
hooks:
55
- id: black
66
name: Black
7-
entry: poetry run black docling_langchain test
7+
entry: poetry run black langchain_docling test
88
pass_filenames: false
99
language: system
1010
files: '\.py$'
1111
- id: isort
1212
name: isort
13-
entry: poetry run isort docling_langchain test
13+
entry: poetry run isort langchain_docling test
1414
pass_filenames: false
1515
language: system
1616
files: '\.py$'
1717
- id: autoflake
1818
name: autoflake
19-
entry: poetry run autoflake docling_langchain test
19+
entry: poetry run autoflake langchain_docling test
2020
pass_filenames: false
2121
language: system
2222
files: '\.py$'
2323
- id: mypy
2424
name: MyPy
25-
entry: poetry run mypy docling_langchain test
25+
entry: poetry run mypy langchain_docling test
2626
pass_filenames: false
2727
language: system
2828
files: '\.py$'
2929
- id: flake8
3030
name: Flake8
31-
entry: poetry run flake8 docling_langchain
31+
entry: poetry run flake8 langchain_docling
3232
pass_filenames: false
3333
language: system
3434
files: '\.py$'

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Docling LangChain integration
22

3-
[![PyPI version](https://img.shields.io/pypi/v/docling-langchain)](https://pypi.org/project/docling-langchain/)
4-
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling-langchain)](https://pypi.org/project/docling-langchain/)
3+
[![PyPI version](https://img.shields.io/pypi/v/langchain-docling)](https://pypi.org/project/langchain-docling/)
4+
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/langchain-docling)](https://pypi.org/project/langchain-docling/)
55
[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
66
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
77
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -14,17 +14,17 @@ A [Docling](https://github.com/DS4SD/docling) integration for
1414

1515
## Installation
1616

17-
Simply install `docling-langchain` from your package manager, e.g. pip:
17+
Simply install `langchain-docling` from your package manager, e.g. pip:
1818
```bash
19-
pip install docling-langchain
19+
pip install langchain-docling
2020
```
2121

2222
## Usage
2323

2424
Basic usage looks as follows:
2525

2626
```python
27-
from docling_langchain import DoclingLoader
27+
from langchain_docling import DoclingLoader
2828

2929
FILE_PATH = ["https://arxiv.org/pdf/2408.09869"] # Docling Technical Report
3030

examples/docling_loader.ipynb

Lines changed: 40 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,8 @@
3030
"metadata": {},
3131
"source": [
3232
"This example leverages the\n",
33-
"[LangChain Docling integration](../../integrations/langchain/), along with\n",
34-
"Milvus-based document store and retriever instances, as well as sentence-transformers\n",
35-
"embeddings.\n",
33+
"[LangChain Docling integration](../../integrations/langchain/), along with a Milvus\n",
34+
"vector store, as well as sentence-transformers embeddings.\n",
3635
"\n",
3736
"The presented `DoclingLoader` component enables you to:\n",
3837
"- use various document types in your LLM applications with ease and speed, and\n",
@@ -44,8 +43,8 @@
4443
"- `ExportType.DOC_CHUNKS` (default): if you want to have each input document chunked and\n",
4544
" to then capture each individual chunk as a separate LangChain document downstream.\n",
4645
"\n",
47-
"The example allows to explore both modes via parameter `EXPORT_TYPE`; depending on the\n",
48-
"value set, the ingestion and RAG pipelines are then set up accordingly."
46+
"The example allows exploring both modes via parameter `EXPORT_TYPE`; depending on the\n",
47+
"value set, the example pipeline is then set up accordingly."
4948
]
5049
},
5150
{
@@ -78,8 +77,7 @@
7877
}
7978
],
8079
"source": [
81-
"# %pip install -q --progress-bar off --no-warn-conflicts docling-langchain langchain-text-splitters\n",
82-
"%pip install -q --progress-bar off --no-warn-conflicts langchain-text-splitters"
80+
"%pip install -q --progress-bar off --no-warn-conflicts langchain-docling langchain-core langchain-huggingface langchain_milvus langchain python-dotenv"
8381
]
8482
},
8583
{
@@ -104,7 +102,7 @@
104102
"from dotenv import load_dotenv\n",
105103
"from langchain_core.prompts import PromptTemplate\n",
106104
"\n",
107-
"from docling_langchain.loader import ExportType\n",
105+
"from langchain_docling.loader import ExportType\n",
108106
"\n",
109107
"\n",
110108
"def _get_env_from_colab_or_os(key):\n",
@@ -161,11 +159,14 @@
161159
}
162160
],
163161
"source": [
164-
"from docling_langchain import DoclingLoader\n",
162+
"from docling.chunking import HybridChunker\n",
163+
"\n",
164+
"from langchain_docling import DoclingLoader\n",
165165
"\n",
166166
"loader = DoclingLoader(\n",
167167
" file_path=FILE_PATH,\n",
168168
" export_type=EXPORT_TYPE,\n",
169+
" chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),\n",
169170
")\n",
170171
"\n",
171172
"docs = loader.load()"
@@ -257,6 +258,7 @@
257258
"vectorstore = Milvus.from_documents(\n",
258259
" documents=splits,\n",
259260
" embedding=embedding,\n",
261+
" collection_name=\"docling_demo\",\n",
260262
" connection_args={\"uri\": milvus_uri},\n",
261263
" index_params={\"index_type\": \"FLAT\"},\n",
262264
" drop_old=True,\n",
@@ -274,6 +276,27 @@
274276
"cell_type": "code",
275277
"execution_count": 7,
276278
"metadata": {},
279+
"outputs": [],
280+
"source": [
281+
"from langchain.chains import create_retrieval_chain\n",
282+
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
283+
"from langchain_huggingface import HuggingFaceEndpoint\n",
284+
"\n",
285+
"retriever = vectorstore.as_retriever(search_kwargs={\"k\": TOP_K})\n",
286+
"llm = HuggingFaceEndpoint(\n",
287+
" repo_id=GEN_MODEL_ID,\n",
288+
" huggingfacehub_api_token=HF_TOKEN,\n",
289+
")\n",
290+
"\n",
291+
"\n",
292+
"def clip_text(text, threshold=100):\n",
293+
" return f\"{text[:threshold]}...\" if len(text) > threshold else text"
294+
]
295+
},
296+
{
297+
"cell_type": "code",
298+
"execution_count": 15,
299+
"metadata": {},
277300
"outputs": [
278301
{
279302
"name": "stdout",
@@ -283,53 +306,36 @@
283306
"Which are the main AI models in Docling?\n",
284307
"\n",
285308
"Answer:\n",
286-
"\"The main AI models in Docling are:\\n1. A layout analysis model, an accurate object-detector for page elements.\\n2. TableFormer, a state-of-the-art table structure recognition model.\"\n",
309+
"Docling currently supports two main AI models, namely a layout analysis model and a table structure recognition model. The first model is a layout analysis model, an accurate object-detector for page ...\n",
287310
"\n",
288311
"Source 1:\n",
289-
" text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a lay...\"\n",
312+
" text: \"3.2 AI models\\nAs part of Docling, we initially release two highly capable AI models to the open-source community, which have been developed and published recently by our team. The first model is a layout analysis model, an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], a state-of-the-art table structure re...\"\n",
290313
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/50', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 3, 'bbox': {'l': 108.0, 't': 405.1419982910156, 'r': 504.00299072265625, 'b': 330.7799987792969, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 608]}]}], 'headings': ['3.2 AI models'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
291314
" source: https://arxiv.org/pdf/2408.09869\n",
292315
"\n",
293316
"Source 2:\n",
294-
" text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieve...\"\n",
317+
" text: \"3 Processing pipeline\\nDocling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support ...\"\n",
295318
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/26', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 2, 'bbox': {'l': 108.0, 't': 273.01800537109375, 'r': 504.00299072265625, 'b': 176.83799743652344, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 796]}]}], 'headings': ['3 Processing pipeline'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
296319
" source: https://arxiv.org/pdf/2408.09869\n",
297320
"\n",
298321
"Source 3:\n",
299-
" text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-clas...\"\n",
322+
" text: \"6 Future work and contributions\\nDocling is designed to allow easy extension of the model library and pipelines. In the future, we plan to extend Docling with several more models, such as a figure-classifier model, an equationrecognition model, a code-recognition model and more. This will help improve the quality of conversion for specific types of ...\"\n",
300323
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/76', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 322.468994140625, 'r': 504.00299072265625, 'b': 259.0169982910156, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 543]}]}, {'self_ref': '#/texts/77', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 108.0, 't': 251.6540069580078, 'r': 504.00299072265625, 'b': 198.99200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 402]}]}], 'headings': ['6 Future work and contributions'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
301-
" source: https://arxiv.org/pdf/2408.09869\n",
302-
"\n",
303-
"Source 4:\n",
304-
" text: \"3.3 Assembly\\nIn the final pipeline stage, Docling assembles all prediction results produced on each page into a well-defined datatype that encapsulates a converted document, as defined in the auxiliar...\"\n",
305-
" dl_meta: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/62', 'parent': {'$ref': '#/body'}, 'children': [], 'label': 'text', 'prov': [{'page_no': 4, 'bbox': {'l': 108.0, 't': 506.08099365234375, 'r': 504.00299072265625, 'b': 431.718994140625, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 622]}]}], 'headings': ['3.3 Assembly'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 11465328351749295394, 'filename': '2408.09869v5.pdf'}}\n",
306324
" source: https://arxiv.org/pdf/2408.09869\n"
307325
]
308326
}
309327
],
310328
"source": [
311-
"from langchain.chains import create_retrieval_chain\n",
312-
"from langchain.chains.combine_documents import create_stuff_documents_chain\n",
313-
"from langchain_huggingface import HuggingFaceEndpoint\n",
314-
"\n",
315-
"llm = HuggingFaceEndpoint(repo_id=GEN_MODEL_ID)\n",
316-
"\n",
317-
"\n",
318-
"def clip_text(text, threshold=100):\n",
319-
" return f\"{text[:threshold]}...\" if len(text) > threshold else text\n",
320-
"\n",
321-
"\n",
322-
"retriever = vectorstore.as_retriever()\n",
323329
"question_answer_chain = create_stuff_documents_chain(llm, PROMPT)\n",
324330
"rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n",
325331
"resp_dict = rag_chain.invoke({\"input\": QUESTION})\n",
326332
"\n",
327-
"answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
328-
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{json.dumps(answer)}\")\n",
333+
"clipped_answer = clip_text(resp_dict[\"answer\"], threshold=200)\n",
334+
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
329335
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
330336
" print()\n",
331337
" print(f\"Source {i+1}:\")\n",
332-
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=200))}\")\n",
338+
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
333339
" for key in doc.metadata:\n",
334340
" if key != \"pk\":\n",
335341
" val = doc.metadata.get(key)\n",
@@ -361,7 +367,7 @@
361367
"name": "python",
362368
"nbconvert_exporter": "python",
363369
"pygments_lexer": "ipython3",
364-
"version": "3.12.7"
370+
"version": "3.12.8"
365371
}
366372
},
367373
"nbformat": 4,

docling_langchain/__init__.py renamed to langchain_docling/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@
44
#
55
"""Docling LangChain package."""
66

7-
from docling_langchain.loader import DoclingLoader
7+
from langchain_docling.loader import DoclingLoader
File renamed without changes.
File renamed without changes.

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[tool.poetry]
2-
name = "docling-langchain"
2+
name = "langchain-docling"
33
version = "0.1.0" # DO NOT EDIT, updated automatically
44
description = "Docling LangChain integration"
55
authors = ["Panos Vagenas <[email protected]>"]
@@ -16,7 +16,7 @@ classifiers = [
1616
"Topic :: Scientific/Engineering :: Artificial Intelligence",
1717
"Programming Language :: Python :: 3"
1818
]
19-
packages = [{include = "docling_langchain"}]
19+
packages = [{include = "langchain_docling"}]
2020

2121
[tool.poetry.dependencies]
2222
python = ">=3.9,<3.13"

test/test_loader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from docling.chunking import HierarchicalChunker
66
from docling.datamodel.document import DoclingDocument as DLDocument
77

8-
from docling_langchain.loader import DoclingLoader, ExportType
8+
from langchain_docling.loader import DoclingLoader, ExportType
99

1010
in_json_str = json.dumps(
1111
{

0 commit comments

Comments (0)