run-llama · joshuaFordyce · Oct 7, 2025 · Oct 9, 2025 · Oct 20, 2025 · Oct 21, 2025
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/.gitignore b/llama-index-integrations/readers/llama-index-readers-trino/.gitignore
@@ -0,0 +1,20 @@
+# Python
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+.ipynb_checkpoints/
+venv/
+.env
+
+# LlamaIndex/Poetry
+.venv/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+dist/
+build/
+*.egg-info
+htmlcov/
+tmp/
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-trino/CHANGELOG.md
@@ -0,0 +1,15 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.1.0] - YYYY-MM-DD
+
+### Added
+
+- Initial implementation of the `TrinoReader`.
+- Basic connection, query execution, and document transformation logic.
+- Setup files: `pyproject.toml`, `Makefile`, `README.md`, `LICENSE`, and `.gitignore`.
+- Unit tests with mocking for Trino connection.
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/LICENSE b/llama-index-integrations/readers/llama-index-readers-trino/LICENSE
@@ -0,0 +1,26 @@
+MIT License
+
+Copyright (c) 2025 Joshua Fordyce
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/Makefile b/llama-index-integrations/readers/llama-index-readers-trino/Makefile
@@ -0,0 +1,26 @@
+.PHONY: install setup lint format test mypy clean
+
+# Setup and Dependencies
+# `install` is declared in .PHONY, so it needs a rule; it is an alias for `setup`.
+install: setup
+setup:
+	poetry install --with dev
+
+# Linting and Formatting
+lint:
+	ruff check .
+format:
+	ruff check . --fix
+	ruff format .
+
+# Testing and Typing
+test:
+	pytest tests/
+mypy:
+	mypy .
+
+# Clean up build artifacts and tool caches (mirrors the entries in .gitignore)
+clean:
+	find . -name "__pycache__" -exec rm -rf {} +
+	rm -rf .pytest_cache
+	rm -rf .mypy_cache
+	rm -rf .ruff_cache
+	rm -rf htmlcov
+	rm -rf build
+	rm -rf dist
+	rm -rf *.egg-info
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/README.md b/llama-index-integrations/readers/llama-index-readers-trino/README.md
@@ -0,0 +1,65 @@
+TrinoReader is a custom data loader designed to solve the problem of robust data ingestion for LlamaIndex RAG pipelines on a Trino data lake.
+
+## Quick Start Guide
+
+### Installation
+
+Install the required Python packages, including the core reader and the native `trino` client:
+
+```bash
+pip install llama-index-core trino pandas
+```
+
+### Usage Example: Data Ingestion
+
+The TrinoReader is instantiated with standard connection parameters and uses the load_data method to execute a query and retrieve documents ready for indexing.
+
+```Python
+import logging
+from llama_index.core.schema import Document
+from your_module import TrinoReader # Assumed import
+
+# 1. Define the SQL Query (Explicitly list columns for best practice)
+query_to_index = """
+SELECT c_custkey, c_name, c_acctbal
+FROM tpch.tiny.customer
+WHERE c_nationkey = 1
+LIMIT 5;
+"""
+
+# 2. Instantiate the Reader
+trino_data_loader = TrinoReader(
+    host="localhost",
+    port=8080,
+    user="rag_user",
+    catalog="tpch",
+    schema="tiny"
+)
+
+# 3. Execute the Ingestion
+print(f"Executing query on Trino...")
+try:
+    documents: List[Document] = trino_data_loader.load_data(query=query_to_index)
+
+    # 4. Verification: Inspect the RAG-ready Document
+    if documents:
+        print(f"Successfully loaded {len(documents)} documents.")
+        print("\n--- Example Document (High-Density Context) ---")
+        print(f"Text Content: {documents[0].text}")
+        print(f"Metadata: {documents[0].metadata}")
+        print("------------------------------------------")
+
+except Exception as e:
+    logging.error(f"FATAL: Data loading failed: {e}")
+
+```
+
+### Contributing
+
+This is an open-source project. If you have any suggestions for improvement, or would like to contribute a fix, please feel free to submit a pull request.
+
+### Focus Areas for Contribution:
+
+Implementing the lazy_load_data generator for memory-efficient streaming of massive tables.
+
+Adding support for advanced Trino authentication methods (Kerberos, JWT).
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/llama_index/base.py b/llama-index-integrations/readers/llama-index-readers-trino/llama_index/base.py
@@ -0,0 +1,154 @@
+from typing import Dict, List, Any, Tuple
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+import trino
+import logging
+
+logger = logging.getLogger(__name__)
+# Connections are opened through the DB-API layer of the
+# trino-python-client (`trino.dbapi`), imported above.
+
+
+class TrinoReader(BaseReader):
+    """
+
+    Trino database reader.
+
+    Loads data from a Trino cluster into Document used by LlamaIndex.
+
+    Args:
+        host (str): server that's running Trino
+        schema (str): reside within a catalog and serve as a wa to organize tables and other database objects
+        port (int): network port number used for communication with a Trino cluster
+        catalog (str): A catalog in trino specifies a connector
+
+
+
+    """
+
+    def __init__(
+        self,
+        user: str,
+        schema: str,
+        host: str,
+        port: int = 8080,
+        catalog: str = "hive",
+        **kwargs: Any,
+    ) -> None:
+        """Initialize with Trino connection parameters."""
+        # Store connection parameters (self.host, self.port, etc.)
+        # self.conn_params = {...}
+
+        self.host = host
+        self.port = port
+        self.catalog = catalog
+        self.user = user
+        self.schema = schema
+        self._conn = None
+        self._cursor = None
+
+        self._conn_paramse = {
+            "host": host,
+            "port": port,
+            "catalog": catalog,
+            "user": user,
+            "schema": schema,
+        }
+
+    def configureConnection(self) -> Tuple[trino.dbapi.Connection, trino.dbapi.Cursor]:
+        """
+        Configure Connection for Trino Datawarehouse
+        """
+        if self._conn is None or self._conn.closed:
+            try:
+                self._conn = trino.dbapi.connect(
+                    host=self.host,
+                    port=self.port,
+                    catalog=self.catalog,
+                    user=self.user,
+                    schema=self.schema,
+                )
+
+                self._cursor = self._conn.cursor()
+            except trino.dbapi.DatabaseError as e:
+                print(f"Trino connection failed:{e}")
+                raise
+        return self._conn, self._cursor
+
+    def execute_query(
+        self, query: str, conn: trino.dbapi.Connection, cur: trino.dbapi.Cursor
+    ):
+        """
+        Executes Query againg Trino instance
+
+        Args:
+            query (str): The SQL++ query to execute.
+            conn (trino.dbapi.Connection) The trino connection used to build the cursor
+            cursor (trino.dbapi.Cursor) an Object used to execute SQL queries against Trino
+
+        """
+        try:
+            cur.execute(query)
+
+            rows = cur.fetchall()
+            return [rows, cur.description]
+        except trino.dbapi.DatabaseError as e:
+            print(f"Trino connection failed: {e}")
+            raise
+
+    def load_data(self, query: str) -> List[Document]:
+        """
+        Loads data from Trino by executing a single SQL query.
+
+        Args:
+            query: The SQL query to execute against the Trino cluster.
+
+        """
+        all_documents = []
+
+        conn = None
+        cur = None
+        try:
+            conn, cur = self.configureConnection()
+            if not conn or not cur:
+                logger.warning("Could not establish connection; returning empty list")
+                return []
+            # 1. Connect to Trino using self.conn_params
+
+            results = self.execute_query(query, conn, cur)
+
+            column_names = [desc[0] for desc in results[1]]
+
+            if results[0] is None:
+                return []
+
+            # 3 Document Transformation
+            for row_index, row in enumerate(results[0]):
+                # Ensure all elements in row are cast to str before joining for content
+                content = ", ".join(
+                    f"{name}: {value}" for name, value in zip(column_names, row)
+                )
+
+                # Metadata must be a mapping (Dict[str, Any])
+                metadata: Dict[str, Any] = dict(zip(column_names, row))
+                metadata.update(
+                    {
+                        "source": "raw_data",
+                        "catalog": self.catalog,
+                        "schema": self.schema,
+                        "row_id": row_index,
+                    }
+                )
+
+                all_documents.append(Document(text=content, metadata=metadata))
+
+            return all_documents
+
+        except Exception as e:
+            logger.error(f"FATAL ERROR during data loading: {e}")
+            raise
+        finally:
+            if cur:
+                cur.close()
+            if conn:
+                conn.close()
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/llama_index/init.py b/llama-index-integrations/readers/llama-index-readers-trino/llama_index/init.py
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-trino/pyproject.toml
@@ -0,0 +1,76 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[dependency-groups]
+dev = [
+    "ipython==8.10.0",
+    "jupyter>=1.0.0,<2",
+    "mypy==0.991",
+    "pre-commit==3.2.0",
+    "pylint==2.15.10",
+    "pytest==7.2.1",
+    "pytest-mock==3.11.1",
+    "ruff==0.11.11",
+    "types-Deprecated>=0.1.0",
+    "types-PyYAML>=6.0.12.12,<7",
+    "types-protobuf>=4.24.0.4,<5",
+    "types-redis==4.5.5.0",
+    "types-requests==2.28.11.8",
+    "types-setuptools==67.1.0.0",
+    "black[jupyter]<=23.9.1,>=23.7.0",
+    "codespell[toml]>=v2.2.6",
+    "diff-cover>=9.2.0",
+    "pytest-cov>=6.1.1",
+    # Required to run the Trino reader tests:
+    "trino>=0.316.0",
+    "pandas>=2.0.0",
+]
+
+[project]
+name = "llama-index-readers-trino"
+version = "0.1.0"  # Starting version for a new integration
+description = "LlamaIndex Data Loader for Trino distributed SQL query engine."
+authors = [{name = "Joshua Fordyce", email = "[email protected]"}]
+requires-python = ">=3.9,<4.0"
+readme = "README.md"
+license = "MIT"
+maintainers = [{name = "Joshua Fordyce"}]
+keywords = [
+    "trino",
+    "distributed query",
+    "data lakehouse",
+    "data federation",
+    "sql",
+]
+# Runtime dependencies: llama-index-core plus the native Trino Python client
+# (`trino`), which the reader uses to connect and run queries.
+dependencies = ["llama-index-core>=0.13.0,<0.15", "trino>=0.316.0"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.hatch.build.targets.sdist]
+include = ["llama_index/"]
+exclude = ["**/BUILD"]
+
+[tool.hatch.build.targets.wheel]
+include = ["llama_index/"]
+exclude = ["**/BUILD"]
+
+[tool.llamahub]
+# Must match the import path of the package that exposes TrinoReader
+contains_example = false
+import_path = "llama_index.readers.trino"
+
+[tool.llamahub.class_authors]
+# Maps each exported class to its author
+TrinoReader = "Joshua Fordyce"
+
+[tool.mypy]
+disallow_untyped_defs = true
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.9"  # Matches the minimum version allowed by requires-python
diff --git a/llama-index-integrations/readers/llama-index-readers-trino/tests/__init__.py b/llama-index-integrations/readers/llama-index-readers-trino/tests/__init__.py