Skip to content

Commit fd50ce3

Browse files
AyushExel and prrao87 authored
Intro to dealing with multimodal data (#72)
* blob example * Cleanup --------- Co-authored-by: prrao87 <prrao87@gmail.com>
1 parent 1e3e4b4 commit fd50ce3

File tree

8 files changed

+538
-212
lines changed

8 files changed

+538
-212
lines changed

docs/docs.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
"pages": [
7878
"tables/index",
7979
"tables/create",
80+
"tables/multimodal",
8081
"tables/schema",
8182
"tables/update",
8283
"tables/versioning",

docs/snippets/multimodal.mdx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}
2+
3+
export const PyBlobApiIngest = "import lancedb\nimport lance\n\ndb = lancedb.connect(db_path_factory(\"blob_db\"))\n \n# Create sample data\ndata = [\n {\"id\": 1, \"video\": b\"fake_video_bytes_1\"},\n {\"id\": 2, \"video\": b\"fake_video_bytes_2\"}\n]\n \n# Create the table\ntbl = db.create_table(\"videos\", data=data, schema=schema)\n";
4+
5+
export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n";
6+
7+
export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n";
8+
9+
export const PyDefineSchema = "# Define schema explicitly to ensure image_blob is treated as binary\nschema = pa.schema([\n    pa.field(\"id\", pa.int32()),\n    pa.field(\"filename\", pa.string()),\n    pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n    pa.field(\"image_blob\", pa.binary()), # Important: Use pa.binary() for blobs\n    pa.field(\"label\", pa.string())\n])\n";
10+
11+
export const PyIngestData = "tbl = db.create_table(\"images\", data=data, schema=schema, mode=\"overwrite\")\n";
12+
13+
export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport pandas as pd\nimport numpy as np\nimport io\nfrom PIL import Image\n";
14+
15+
export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n";
16+
17+
export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";
18+

docs/snippets/quickstart.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ export const PyQuickstartVectorSearch1 = "# Let's search for vectors similar to
1414

1515
export const PyQuickstartVectorSearch2 = "# Let's search for vectors similar to \"wizard\"\nquery_vector = [0.7, 0.3, 0.5]\n\nresults = table.search(query_vector).limit(2).to_polars()\nprint(results)\n";
1616

17+
export const TsQuickstartOutputPandas = "result = await table.search(queryVector).limit(2).toArray();\n";
18+
1719
export const TsQuickstartAddData = "const moreData = [\n { id: \"7\", text: \"mage\", vector: [0.6, 0.3, 0.4] },\n { id: \"8\", text: \"bard\", vector: [0.3, 0.8, 0.4] },\n];\n\n// Add data to table\nawait table.add(moreData);\n";
1820

1921
export const TsQuickstartCreateTable = "const data = [\n { id: \"1\", text: \"knight\", vector: [0.9, 0.4, 0.8] },\n { id: \"2\", text: \"ranger\", vector: [0.8, 0.4, 0.7] },\n { id: \"9\", text: \"priest\", vector: [0.6, 0.2, 0.6] },\n { id: \"4\", text: \"rogue\", vector: [0.7, 0.4, 0.7] },\n];\nlet table = await db.createTable(\"adventurers\", data, { mode: \"overwrite\" });\n";
@@ -24,8 +26,6 @@ export const TsQuickstartOpenTable = "table = await db.openTable(\"adventurers\"
2426

2527
export const TsQuickstartOutputArray = "result = await table.search(queryVector).limit(2).toArray();\nconsole.table(result);\n";
2628

27-
export const TsQuickstartOutputPandas = "result = await table.search(queryVector).limit(2).toArray();\n";
28-
2929
export const TsQuickstartVectorSearch1 = "// Let's search for vectors similar to \"warrior\"\nlet queryVector = [0.8, 0.3, 0.8];\n\nlet result = await table.search(queryVector).limit(2).toArray();\nconsole.table(result);\n";
3030

3131
export const TsQuickstartVectorSearch2 = "// Let's search for vectors similar to \"wizard\"\nqueryVector = [0.7, 0.3, 0.5];\n\nconst results = await table.search(queryVector).limit(2).toArray();\nconsole.table(results);\n";

docs/snippets/search.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ export const PyBasicHybridSearch = "data = [\n {\"text\": \"rebel spaceships
88

99
export const PyBasicHybridSearchAsync = "uri = \"data/sample-lancedb\"\nasync_db = await lancedb.connect_async(uri)\ndata = [\n {\"text\": \"rebel spaceships striking from a hidden base\"},\n {\"text\": \"have won their first victory against the evil Galactic Empire\"},\n {\"text\": \"during the battle rebel spies managed to steal secret plans\"},\n {\"text\": \"to the Empire's ultimate weapon the Death Star\"},\n]\nasync_tbl = await async_db.create_table(\"documents_async\", schema=Documents)\n# ingest docs with auto-vectorization\nawait async_tbl.add(data)\n# Create a fts index before the hybrid search\nawait async_tbl.create_index(\"text\", config=FTS())\ntext_query = \"flower moon\"\n# hybrid search with default re-ranker\nawait (await async_tbl.search(\"flower moon\", query_type=\"hybrid\")).to_pandas()\n";
1010

11-
export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";
12-
1311
export const PyClassDocuments = "class Documents(LanceModel):\n vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n text: str = embeddings.SourceField()\n";
1412

13+
export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";
14+
1515
export const PyCreateTableAsyncWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\nasync_tbl = await async_db.create_table(\"documents_async\", data=data)\n";
1616

1717
export const PyCreateTableWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\n# Synchronous client\ntbl = db.create_table(\"documents\", data=data)\n";

docs/tables/multimodal.mdx

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
---
2+
title: Multimodal Data (Blobs)
3+
sidebarTitle: "Working with multimodal data"
4+
description: Learn how to store and query multimodal data (images, audio, video) directly in LanceDB using binary columns.
5+
icon: "images"
6+
keywords: ["blob", "large binary", "blobs", "multimodal"]
7+
---
8+
9+
import {
10+
PyMultimodalImports as MultimodalImports,
11+
PyCreateDummyData as CreateDummyData,
12+
PyDefineSchema as DefineSchema,
13+
PyIngestData as IngestData,
14+
PySearchData as SearchData,
15+
PyProcessResults as ProcessResults,
16+
PyBlobApiSchema as BlobApiSchema,
17+
PyBlobApiIngest as BlobApiIngest,
18+
} from '/snippets/multimodal.mdx';
19+
20+
LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases.
21+
22+
This guide demonstrates how to ingest, store, and retrieve image data using standard binary columns, and also introduces the **Lance Blob API** for optimized handling of larger multimodal files.
23+
24+
## Storing binary data
25+
26+
To store binary data, you need to use the `pa.binary()` data type in your Arrow schema. In Python, this corresponds to `bytes` objects if you're using LanceDB's Pydantic `LanceModel` to define the schema.
27+
28+
### 1. Setup and imports
29+
30+
First, let's import the necessary libraries. We'll use `PIL` (Pillow) for image handling and `io` for byte conversion.
31+
32+
<CodeGroup>
33+
<CodeBlock filename="Python" language="Python" icon="python">
34+
{MultimodalImports}
35+
</CodeBlock>
36+
</CodeGroup>
37+
38+
### 2. Preparing data
39+
40+
For this example, we'll create some dummy in-memory images. In a real application, you would read these from files or an API. The key is to convert your data (image, audio, etc.) into a raw `bytes` object.
41+
42+
<CodeGroup>
43+
<CodeBlock filename="Python" language="Python" icon="python">
44+
{CreateDummyData}
45+
</CodeBlock>
46+
</CodeGroup>
47+
48+
### 3. Defining the schema
49+
50+
When creating the table, it is **highly recommended** to define the schema explicitly. This ensures that your binary data is correctly interpreted as a `binary` type by Arrow/LanceDB and not as a generic string or list.
51+
52+
<CodeGroup>
53+
<CodeBlock filename="Python" language="Python" icon="python">
54+
{DefineSchema}
55+
</CodeBlock>
56+
</CodeGroup>
57+
58+
### 4. Ingesting data
59+
60+
Now, create the table using the data and the defined schema.
61+
62+
<CodeGroup>
63+
<CodeBlock filename="Python" language="Python" icon="python">
64+
{IngestData}
65+
</CodeBlock>
66+
</CodeGroup>
67+
68+
## Retrieving and using blobs
69+
70+
When you search your LanceDB table, you can retrieve the binary column just like any other metadata.
71+
72+
<CodeGroup>
73+
<CodeBlock filename="Python" language="Python" icon="python">
74+
{SearchData}
75+
</CodeBlock>
76+
</CodeGroup>
77+
78+
### Converting bytes back to objects
79+
80+
Once you have the `bytes` data back from the search result, you can decode it back into its original format (e.g., a PIL Image, an Audio buffer, etc.).
81+
82+
<CodeGroup>
83+
<CodeBlock filename="Python" language="Python" icon="python">
84+
{ProcessResults}
85+
</CodeBlock>
86+
</CodeGroup>
87+
88+
## Large Blobs (Blob API)
89+
90+
For larger files like high-resolution images or videos, Lance provides a specialized **Blob API**. By using `pa.large_binary()` and specific metadata, you enable **lazy loading** and optimized encoding. This allows you to work with massive datasets without loading all binary data into memory upfront.
91+
92+
### 1. Defining a blob schema
93+
94+
To use the Blob API, you must mark the column with `{"lance-encoding:blob": "true"}` metadata.
95+
96+
<CodeGroup>
97+
<CodeBlock filename="Python" language="Python" icon="python">
98+
{BlobApiSchema}
99+
</CodeBlock>
100+
</CodeGroup>
101+
102+
### 2. Ingesting large blobs
103+
104+
You can then ingest data normally, and Lance will handle the optimized storage.
105+
106+
<CodeGroup>
107+
<CodeBlock filename="Python" language="Python" icon="python">
108+
{BlobApiIngest}
109+
</CodeBlock>
110+
</CodeGroup>
111+
112+
<Card>
113+
For more advanced usage, including random access and file-like reading of blobs, see the
114+
Lance format's [blob API documentation](https://lance.org/guide/blob/).
115+
</Card>
116+
117+
## Other modalities
118+
119+
The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data:
120+
121+
- **Audio:** Read `.wav` or `.mp3` files as bytes.
122+
- **Video:** Store video transitions or full clips using the Blob API.
123+
- **PDFs/Documents:** Store the raw file content for document search.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ dependencies = [
1111
"pydantic>=2.12.4",
1212
"pytest>=9.0.1",
1313
"pytest-asyncio>=1.3.0",
14+
"Pillow>=11.0.0",
1415
]

tests/py/test_multimodal.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
3+
4+
import pytest
5+
try:
6+
import lancedb
7+
import numpy as np
8+
import pyarrow as pa
9+
import io
10+
from PIL import Image
11+
except ImportError:
12+
pass
13+
14+
# --8<-- [start:multimodal_imports]
15+
import lancedb
16+
import pyarrow as pa
17+
import pandas as pd
18+
import numpy as np
19+
import io
20+
from PIL import Image
21+
# --8<-- [end:multimodal_imports]
22+
23+
def test_multimodal_ingestion(db_path_factory):
24+
# Ensure dependencies are available
25+
pytest.importorskip("PIL")
26+
pytest.importorskip("lancedb")
27+
pytest.importorskip("numpy")
28+
29+
# --8<-- [start:create_dummy_data]
30+
# Create some dummy images
31+
def create_dummy_image(color):
32+
img = Image.new('RGB', (100, 100), color=color)
33+
buf = io.BytesIO()
34+
img.save(buf, format='PNG')
35+
return buf.getvalue()
36+
37+
# Create dataset with metadata, vectors, and image blobs
38+
data = [
39+
{
40+
"id": 1,
41+
"filename": "red_square.png",
42+
"vector": np.random.rand(128).astype(np.float32),
43+
"image_blob": create_dummy_image('red'),
44+
"label": "red"
45+
},
46+
{
47+
"id": 2,
48+
"filename": "blue_square.png",
49+
"vector": np.random.rand(128).astype(np.float32),
50+
"image_blob": create_dummy_image('blue'),
51+
"label": "blue"
52+
}
53+
]
54+
# --8<-- [end:create_dummy_data]
55+
56+
# --8<-- [start:define_schema]
57+
# Define schema explicitly to ensure image_blob is treated as binary
58+
schema = pa.schema([
59+
pa.field("id", pa.int32()),
60+
pa.field("filename", pa.string()),
61+
pa.field("vector", pa.list_(pa.float32(), 128)),
62+
pa.field("image_blob", pa.binary()), # Important: Use pa.binary() for blobs
63+
pa.field("label", pa.string())
64+
])
65+
# --8<-- [end:define_schema]
66+
67+
db_uri = db_path_factory("multimodal_db")
68+
db = lancedb.connect(db_uri)
69+
70+
# --8<-- [start:ingest_data]
71+
tbl = db.create_table("images", data=data, schema=schema, mode="overwrite")
72+
# --8<-- [end:ingest_data]
73+
74+
assert len(tbl) == 2
75+
76+
# --8<-- [start:search_data]
77+
# Search for similar images
78+
query_vector = np.random.rand(128).astype(np.float32)
79+
results = tbl.search(query_vector).limit(1).to_pandas()
80+
# --8<-- [end:search_data]
81+
82+
# --8<-- [start:process_results]
83+
# Convert back to PIL Image
84+
for _, row in results.iterrows():
85+
image_bytes = row['image_blob']
86+
image = Image.open(io.BytesIO(image_bytes))
87+
print(f"Retrieved image: {row['filename']}, Size: {image.size}")
88+
# You can now use 'image' with other libraries or display it
89+
# --8<-- [end:process_results]
90+
91+
assert len(results) == 1
92+
93+
def test_blob_api_definition(db_path_factory):
94+
# --8<-- [start:blob_api_schema]
95+
import pyarrow as pa
96+
97+
# Define schema with Blob API metadata for lazy loading
98+
schema = pa.schema([
99+
pa.field("id", pa.int64()),
100+
pa.field(
101+
"video",
102+
pa.large_binary(),
103+
metadata={"lance-encoding:blob": "true"} # Enable Blob API
104+
),
105+
])
106+
# --8<-- [end:blob_api_schema]
107+
108+
# --8<-- [start:blob_api_ingest]
109+
import lancedb
110+
import lance
111+
112+
db = lancedb.connect(db_path_factory("blob_db"))
113+
114+
# Create sample data
115+
data = [
116+
{"id": 1, "video": b"fake_video_bytes_1"},
117+
{"id": 2, "video": b"fake_video_bytes_2"}
118+
]
119+
120+
# Create the table
121+
tbl = db.create_table("videos", data=data, schema=schema)
122+
# --8<-- [end:blob_api_ingest]
123+
assert len(tbl) == 2

0 commit comments

Comments
 (0)