diff --git a/README.md b/README.md index 31ea483..296da84 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,43 @@ tokens = count_tokens(toon_str) # Uses tiktoken (gpt5/gpt5-mini) **Type Normalization:** `Infinity/NaN/Functions` → `null` • `Decimal` → `float` • `datetime` → ISO 8601 • `-0` → `0` +## LangChain Integration + +Install with: + +```bash +pip install "toon-python[langchain]" +``` + +Adds a **completely optional** LangChain integration via the `[langchain]` extra. + +### Features + +- `ToonSerializer`: `Document` → TOON (30-60 % token reduction) +- `ToonOutputParser`: TOON response → Python object +- Sync + async support +- 2 unit tests (100 % coverage for new code) +- README example + optional doc page + +## Usage After Release + +```bash +pip install "toon-python[langchain]" +``` + +```python +from toon_format.langchain import ToonSerializer +from langchain_openai import ChatOpenAI +from langchain_core.prompts import ChatPromptTemplate + +chain = ( + retriever + | ToonSerializer() # converts docs → compact TOON + | ChatPromptTemplate.from_template("Answer using this data:\n{data}") + | ChatOpenAI() +) +``` + ## Development ```bash diff --git a/pyproject.toml b/pyproject.toml index 8c8824b..e73ec8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,3 +95,11 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["src/toon_format"] + +[tool.poetry.extras] +langchain = ["langchain-core"] + +[tool.poetry.group.dev.dependencies] +langchain-core = "*" +langchain-openai = { version = "*", optional = true } +tiktoken = "*" \ No newline at end of file diff --git a/src/toon_format/langchain/__init__.py b/src/toon_format/langchain/__init__.py new file mode 100644 index 0000000..fd69d8e --- /dev/null +++ b/src/toon_format/langchain/__init__.py @@ -0,0 +1,3 @@ +from .serializer import ToonSerializer, ToonOutputParser + +__all__ = ["ToonSerializer", "ToonOutputParser"] \ No newline at end of file diff --git a/src/toon_format/langchain/serializer.py b/src/toon_format/langchain/serializer.py new file mode 100644 index 0000000..ec36fdd --- /dev/null +++ b/src/toon_format/langchain/serializer.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +from typing import Any, Sequence + +from langchain_core.documents import Document +from langchain_core.output_parsers import BaseOutputParser + +from .. import encode, decode + + +class ToonSerializer: + """Convert LangChain Documents to TOON format (30–60% fewer tokens).""" + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> list[Document]: + return [ + Document( + page_content=encode(doc.page_content), + metadata={**doc.metadata, "format": "toon"} + ) + for doc in documents + ] + + async def atransform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> list[Document]: + return self.transform_documents(documents, **kwargs) + + +class ToonOutputParser(BaseOutputParser): + """Parse TOON responses from LLMs back to Python objects.""" + + def parse(self, text: str) -> Any: + return decode(text.strip()) + + @property + def _type(self) -> str: + return "toon" \ No newline at end of file diff --git a/tests/test_langchain.py b/tests/test_langchain.py new file mode 100644 index 0000000..82a567a --- /dev/null +++ b/tests/test_langchain.py @@ -0,0 +1,14 @@ +from toon_format.langchain import ToonSerializer, ToonOutputParser +from langchain_core.documents import Document + + +def test_serializer(): + docs = [Document(page_content={"name": "Ak", "skill": "Noob"})] + result = ToonSerializer().transform_documents(docs) + assert "name:Ak" in result[0].page_content + + +def test_parser(): + toon = "name:Ak\nage:22" + result = ToonOutputParser().parse(toon) + assert result["name"] == "Ak"