Skip to content

Commit fd50ce3

Browse files
AyushExel and prrao87 authored
Intro to dealing with multimodal data (#72)
* blob example * Cleanup --------- Co-authored-by: prrao87 <prrao87@gmail.com>
1 parent 1e3e4b4 commit fd50ce3

File tree

8 files changed

+538
-212
lines changed

8 files changed

+538
-212
lines changed

docs/docs.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
"pages": [
7878
"tables/index",
7979
"tables/create",
80+
"tables/multimodal",
8081
"tables/schema",
8182
"tables/update",
8283
"tables/versioning",

docs/snippets/multimodal.mdx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{/* Auto-generated by scripts/mdx_snippets_gen.py. Do not edit manually. */}
2+
3+
export const PyBlobApiIngest = "import lancedb\nimport lance\n\ndb = lancedb.connect(db_path_factory(\"blob_db\"))\n \n# Create sample data\ndata = [\n {\"id\": 1, \"video\": b\"fake_video_bytes_1\"},\n {\"id\": 2, \"video\": b\"fake_video_bytes_2\"}\n]\n \n# Create the table\ntbl = db.create_table(\"videos\", data=data, schema=schema)\n";
4+
5+
export const PyBlobApiSchema = "import pyarrow as pa\n\n# Define schema with Blob API metadata for lazy loading\nschema = pa.schema([\n pa.field(\"id\", pa.int64()),\n pa.field(\n \"video\", \n pa.large_binary(), \n metadata={\"lance-encoding:blob\": \"true\"} # Enable Blob API\n ),\n])\n";
6+
7+
export const PyCreateDummyData = "# Create some dummy images\ndef create_dummy_image(color):\n img = Image.new('RGB', (100, 100), color=color)\n buf = io.BytesIO()\n img.save(buf, format='PNG')\n return buf.getvalue()\n\n# Create dataset with metadata, vectors, and image blobs\ndata = [\n {\n \"id\": 1,\n \"filename\": \"red_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('red'),\n \"label\": \"red\"\n },\n {\n \"id\": 2,\n \"filename\": \"blue_square.png\",\n \"vector\": np.random.rand(128).astype(np.float32),\n \"image_blob\": create_dummy_image('blue'),\n \"label\": \"blue\"\n }\n]\n";
8+
9+
export const PyDefineSchema = "# Define schema explicitly to ensure image_blob is treated as binary\nschema = pa.schema([\n    pa.field(\"id\", pa.int32()),\n    pa.field(\"filename\", pa.string()),\n    pa.field(\"vector\", pa.list_(pa.float32(), 128)),\n    pa.field(\"image_blob\", pa.binary()), # Important: Use pa.binary() for blobs\n    pa.field(\"label\", pa.string())\n])\n";
10+
11+
export const PyIngestData = "tbl = db.create_table(\"images\", data=data, schema=schema, mode=\"overwrite\")\n";
12+
13+
export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport pandas as pd\nimport numpy as np\nimport io\nfrom PIL import Image\n";
14+
15+
export const PyProcessResults = "# Convert back to PIL Image\nfor _, row in results.iterrows():\n image_bytes = row['image_blob']\n image = Image.open(io.BytesIO(image_bytes))\n print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")\n # You can now use 'image' with other libraries or display it\n";
16+
17+
export const PySearchData = "# Search for similar images\nquery_vector = np.random.rand(128).astype(np.float32)\nresults = tbl.search(query_vector).limit(1).to_pandas()\n";
18+

docs/snippets/quickstart.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ export const PyQuickstartVectorSearch1 = "# Let's search for vectors similar to
1414

1515
export const PyQuickstartVectorSearch2 = "# Let's search for vectors similar to \"wizard\"\nquery_vector = [0.7, 0.3, 0.5]\n\nresults = table.search(query_vector).limit(2).to_polars()\nprint(results)\n";
1616

17+
export const TsQuickstartOutputPandas = "result = await table.search(queryVector).limit(2).toArray();\n";
18+
1719
export const TsQuickstartAddData = "const moreData = [\n { id: \"7\", text: \"mage\", vector: [0.6, 0.3, 0.4] },\n { id: \"8\", text: \"bard\", vector: [0.3, 0.8, 0.4] },\n];\n\n// Add data to table\nawait table.add(moreData);\n";
1820

1921
export const TsQuickstartCreateTable = "const data = [\n { id: \"1\", text: \"knight\", vector: [0.9, 0.4, 0.8] },\n { id: \"2\", text: \"ranger\", vector: [0.8, 0.4, 0.7] },\n { id: \"9\", text: \"priest\", vector: [0.6, 0.2, 0.6] },\n { id: \"4\", text: \"rogue\", vector: [0.7, 0.4, 0.7] },\n];\nlet table = await db.createTable(\"adventurers\", data, { mode: \"overwrite\" });\n";
@@ -24,8 +26,6 @@ export const TsQuickstartOpenTable = "table = await db.openTable(\"adventurers\"
2426

2527
export const TsQuickstartOutputArray = "result = await table.search(queryVector).limit(2).toArray();\nconsole.table(result);\n";
2628

27-
export const TsQuickstartOutputPandas = "result = await table.search(queryVector).limit(2).toArray();\n";
28-
2929
export const TsQuickstartVectorSearch1 = "// Let's search for vectors similar to \"warrior\"\nlet queryVector = [0.8, 0.3, 0.8];\n\nlet result = await table.search(queryVector).limit(2).toArray();\nconsole.table(result);\n";
3030

3131
export const TsQuickstartVectorSearch2 = "// Let's search for vectors similar to \"wizard\"\nqueryVector = [0.7, 0.3, 0.5];\n\nconst results = await table.search(queryVector).limit(2).toArray();\nconsole.table(results);\n";

docs/snippets/search.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ export const PyBasicHybridSearch = "data = [\n {\"text\": \"rebel spaceships
88

99
export const PyBasicHybridSearchAsync = "uri = \"data/sample-lancedb\"\nasync_db = await lancedb.connect_async(uri)\ndata = [\n {\"text\": \"rebel spaceships striking from a hidden base\"},\n {\"text\": \"have won their first victory against the evil Galactic Empire\"},\n {\"text\": \"during the battle rebel spies managed to steal secret plans\"},\n {\"text\": \"to the Empire's ultimate weapon the Death Star\"},\n]\nasync_tbl = await async_db.create_table(\"documents_async\", schema=Documents)\n# ingest docs with auto-vectorization\nawait async_tbl.add(data)\n# Create a fts index before the hybrid search\nawait async_tbl.create_index(\"text\", config=FTS())\ntext_query = \"flower moon\"\n# hybrid search with default re-ranker\nawait (await async_tbl.search(\"flower moon\", query_type=\"hybrid\")).to_pandas()\n";
1010

11-
export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";
12-
1311
export const PyClassDocuments = "class Documents(LanceModel):\n vector: Vector(embeddings.ndims()) = embeddings.VectorField()\n text: str = embeddings.SourceField()\n";
1412

13+
export const PyClassDefinition = "class Metadata(BaseModel):\n source: str\n timestamp: datetime\n\n\nclass Document(BaseModel):\n content: str\n meta: Metadata\n\n\nclass LanceSchema(LanceModel):\n id: str\n vector: Vector(1536)\n payload: Document\n";
14+
1515
export const PyCreateTableAsyncWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\nasync_tbl = await async_db.create_table(\"documents_async\", data=data)\n";
1616

1717
export const PyCreateTableWithNestedSchema = "# Let's add 100 sample rows to our dataset\ndata = [\n LanceSchema(\n id=f\"id{i}\",\n vector=np.random.randn(1536),\n payload=Document(\n content=f\"document{i}\",\n meta=Metadata(source=f\"source{i % 10}\", timestamp=datetime.now()),\n ),\n )\n for i in range(100)\n]\n\n# Synchronous client\ntbl = db.create_table(\"documents\", data=data)\n";

docs/tables/multimodal.mdx

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
---
2+
title: Multimodal Data (Blobs)
3+
sidebarTitle: "Working with multimodal data"
4+
description: Learn how to store and query multimodal data (images, audio, video) directly in LanceDB using binary columns.
5+
icon: "images"
6+
keywords: ["blob", "large binary", "blobs", "multimodal"]
7+
---
8+
9+
import {
10+
PyMultimodalImports as MultimodalImports,
11+
PyCreateDummyData as CreateDummyData,
12+
PyDefineSchema as DefineSchema,
13+
PyIngestData as IngestData,
14+
PySearchData as SearchData,
15+
PyProcessResults as ProcessResults,
16+
PyBlobApiSchema as BlobApiSchema,
17+
PyBlobApiIngest as BlobApiIngest,
18+
} from '/snippets/multimodal.mdx';
19+
20+
LanceDB handles multimodal data—images, audio, video, and PDF files—natively by storing the raw bytes in a binary column alongside your vectors and metadata. This approach simplifies your data infrastructure by keeping the raw assets and their embeddings in the same database, eliminating the need for separate object storage for many use cases.
21+
22+
This guide demonstrates how to ingest, store, and retrieve image data using standard binary columns, and also introduces the **Lance Blob API** for optimized handling of larger multimodal files.
23+
24+
## Storing binary data
25+
26+
To store binary data, you need to use the `pa.binary()` data type in your Arrow schema. In Python, this corresponds to `bytes` objects if you're using LanceDB's Pydantic `LanceModel` to define the schema.
27+
28+
### 1. Setup and imports
29+
30+
First, let's import the necessary libraries. We'll use `PIL` (Pillow) for image handling and `io` for byte conversion.
31+
32+
<CodeGroup>
33+
<CodeBlock filename="Python" language="Python" icon="python">
34+
{MultimodalImports}
35+
</CodeBlock>
36+
</CodeGroup>
37+
38+
### 2. Preparing data
39+
40+
For this example, we'll create some dummy in-memory images. In a real application, you would read these from files or an API. The key is to convert your data (image, audio, etc.) into a raw `bytes` object.
41+
42+
<CodeGroup>
43+
<CodeBlock filename="Python" language="Python" icon="python">
44+
{CreateDummyData}
45+
</CodeBlock>
46+
</CodeGroup>
47+
48+
### 3. Defining the schema
49+
50+
When creating the table, it is **highly recommended** to define the schema explicitly. This ensures that your binary data is correctly interpreted as a `binary` type by Arrow/LanceDB and not as a generic string or list.
51+
52+
<CodeGroup>
53+
<CodeBlock filename="Python" language="Python" icon="python">
54+
{DefineSchema}
55+
</CodeBlock>
56+
</CodeGroup>
57+
58+
### 4. Ingesting data
59+
60+
Now, create the table using the data and the defined schema.
61+
62+
<CodeGroup>
63+
<CodeBlock filename="Python" language="Python" icon="python">
64+
{IngestData}
65+
</CodeBlock>
66+
</CodeGroup>
67+
68+
## Retrieving and using blobs
69+
70+
When you search your LanceDB table, you can retrieve the binary column just like any other metadata.
71+
72+
<CodeGroup>
73+
<CodeBlock filename="Python" language="Python" icon="python">
74+
{SearchData}
75+
</CodeBlock>
76+
</CodeGroup>
77+
78+
### Converting bytes back to objects
79+
80+
Once you have the `bytes` data back from the search result, you can decode it back into its original format (e.g., a PIL Image, an Audio buffer, etc.).
81+
82+
<CodeGroup>
83+
<CodeBlock filename="Python" language="Python" icon="python">
84+
{ProcessResults}
85+
</CodeBlock>
86+
</CodeGroup>
87+
88+
## Large Blobs (Blob API)
89+
90+
For larger files like high-resolution images or videos, Lance provides a specialized **Blob API**. By using `pa.large_binary()` and specific metadata, you enable **lazy loading** and optimized encoding. This allows you to work with massive datasets without loading all binary data into memory upfront.
91+
92+
### 1. Defining a blob schema
93+
94+
To use the Blob API, you must mark the column with `{"lance-encoding:blob": "true"}` metadata.
95+
96+
<CodeGroup>
97+
<CodeBlock filename="Python" language="Python" icon="python">
98+
{BlobApiSchema}
99+
</CodeBlock>
100+
</CodeGroup>
101+
102+
### 2. Ingesting large blobs
103+
104+
You can then ingest data normally, and Lance will handle the optimized storage.
105+
106+
<CodeGroup>
107+
<CodeBlock filename="Python" language="Python" icon="python">
108+
{BlobApiIngest}
109+
</CodeBlock>
110+
</CodeGroup>
111+
112+
<Card>
113+
For more advanced usage, including random access and file-like reading of blobs, see the
114+
Lance format's [blob API documentation](https://lance.org/guide/blob/).
115+
</Card>
116+
117+
## Other modalities
118+
119+
The `pa.binary()` and `pa.large_binary()` types are universal. You can use this same pattern for other types of multimodal data:
120+
121+
- **Audio:** Read `.wav` or `.mp3` files as bytes.
122+
- **Video:** Store video transitions or full clips using the Blob API.
123+
- **PDFs/Documents:** Store the raw file content for document search.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ dependencies = [
1111
"pydantic>=2.12.4",
1212
"pytest>=9.0.1",
1313
"pytest-asyncio>=1.3.0",
14+
"Pillow>=11.0.0",
1415
]

tests/py/test_multimodal.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright The LanceDB Authors
3+
4+
import pytest
5+
try:
6+
import lancedb
7+
import numpy as np
8+
import pyarrow as pa
9+
import io
10+
from PIL import Image
11+
except ImportError:
12+
pass
13+
14+
# --8<-- [start:multimodal_imports]
15+
import lancedb
16+
import pyarrow as pa
17+
import pandas as pd
18+
import numpy as np
19+
import io
20+
from PIL import Image
21+
# --8<-- [end:multimodal_imports]
22+
23+
def test_multimodal_ingestion(db_path_factory):
24+
# Ensure dependencies are available
25+
pytest.importorskip("PIL")
26+
pytest.importorskip("lancedb")
27+
pytest.importorskip("numpy")
28+
29+
# --8<-- [start:create_dummy_data]
30+
# Create some dummy images
31+
def create_dummy_image(color):
32+
img = Image.new('RGB', (100, 100), color=color)
33+
buf = io.BytesIO()
34+
img.save(buf, format='PNG')
35+
return buf.getvalue()
36+
37+
# Create dataset with metadata, vectors, and image blobs
38+
data = [
39+
{
40+
"id": 1,
41+
"filename": "red_square.png",
42+
"vector": np.random.rand(128).astype(np.float32),
43+
"image_blob": create_dummy_image('red'),
44+
"label": "red"
45+
},
46+
{
47+
"id": 2,
48+
"filename": "blue_square.png",
49+
"vector": np.random.rand(128).astype(np.float32),
50+
"image_blob": create_dummy_image('blue'),
51+
"label": "blue"
52+
}
53+
]
54+
# --8<-- [end:create_dummy_data]
55+
56+
# --8<-- [start:define_schema]
57+
# Define schema explicitly to ensure image_blob is treated as binary
58+
schema = pa.schema([
59+
pa.field("id", pa.int32()),
60+
pa.field("filename", pa.string()),
61+
pa.field("vector", pa.list_(pa.float32(), 128)),
62+
pa.field("image_blob", pa.binary()), # Important: Use pa.binary() for blobs
63+
pa.field("label", pa.string())
64+
])
65+
# --8<-- [end:define_schema]
66+
67+
db_uri = db_path_factory("multimodal_db")
68+
db = lancedb.connect(db_uri)
69+
70+
# --8<-- [start:ingest_data]
71+
tbl = db.create_table("images", data=data, schema=schema, mode="overwrite")
72+
# --8<-- [end:ingest_data]
73+
74+
assert len(tbl) == 2
75+
76+
# --8<-- [start:search_data]
77+
# Search for similar images
78+
query_vector = np.random.rand(128).astype(np.float32)
79+
results = tbl.search(query_vector).limit(1).to_pandas()
80+
# --8<-- [end:search_data]
81+
82+
# --8<-- [start:process_results]
83+
# Convert back to PIL Image
84+
for _, row in results.iterrows():
85+
image_bytes = row['image_blob']
86+
image = Image.open(io.BytesIO(image_bytes))
87+
print(f"Retrieved image: {row['filename']}, Size: {image.size}")
88+
# You can now use 'image' with other libraries or display it
89+
# --8<-- [end:process_results]
90+
91+
assert len(results) == 1
92+
93+
def test_blob_api_definition(db_path_factory):
94+
# --8<-- [start:blob_api_schema]
95+
import pyarrow as pa
96+
97+
# Define schema with Blob API metadata for lazy loading
98+
schema = pa.schema([
99+
pa.field("id", pa.int64()),
100+
pa.field(
101+
"video",
102+
pa.large_binary(),
103+
metadata={"lance-encoding:blob": "true"} # Enable Blob API
104+
),
105+
])
106+
# --8<-- [end:blob_api_schema]
107+
108+
# --8<-- [start:blob_api_ingest]
109+
import lancedb
110+
import lance
111+
112+
db = lancedb.connect(db_path_factory("blob_db"))
113+
114+
# Create sample data
115+
data = [
116+
{"id": 1, "video": b"fake_video_bytes_1"},
117+
{"id": 2, "video": b"fake_video_bytes_2"}
118+
]
119+
120+
# Create the table
121+
tbl = db.create_table("videos", data=data, schema=schema)
122+
# --8<-- [end:blob_api_ingest]
123+
assert len(tbl) == 2

0 commit comments

Comments
 (0)