
Commit bf3e373

Authored by georgeh0 and wykrrr
feat(azure-blob): support Azure blob store as a data source (#746)
* add Azure Blob Storage source with full authentication support (#736)
  - Implement Azure Blob Storage source following the S3 pattern
  - Support multiple authentication methods with priority:
    * Connection string (highest priority)
    * SAS token (with proper permissions validation)
    * Account key (full access)
    * Anonymous access (public containers)
  - Include file pattern filtering (include/exclude glob patterns)
* feat(azure-blob): simplify and fix authentication
* docs: revise the docs for Azure Blob source

---------

Co-authored-by: wykrrr <[email protected]>
1 parent 306c45e commit bf3e373

File tree

12 files changed (+1136, -12 lines)

Cargo.lock

Lines changed: 602 additions & 12 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -114,7 +114,13 @@ json5 = "0.4.1"
 aws-config = "1.6.2"
 aws-sdk-s3 = "1.85.0"
 aws-sdk-sqs = "1.67.0"
+time = { version = "0.3", features = ["macros", "serde"] }
 numpy = "0.25.0"
 infer = "0.19.0"
 serde_with = { version = "3.13.0", features = ["base64"] }
 google-cloud-aiplatform-v1 = "0.4.0"
+
+azure_core = "0.21.0"
+azure_storage = "0.21.0"
+azure_storage_blobs = "0.21.0"
+azure_identity = "0.21.0"

docs/docs/ops/sources.md

Lines changed: 66 additions & 0 deletions
@@ -148,6 +148,72 @@ The spec takes the following fields:
 ### Schema
 
 The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:
+
+* `filename` (*Str*, key): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"`.
+* `content` (*Str* if `binary` is `False`, otherwise *Bytes*): the content of the file.
+
+
+## AzureBlob
+
+The `AzureBlob` source imports files from Azure Blob Storage.
+
+### Setup for Azure Blob Storage
+
+#### Get Started
+
+If you don't have experience with Azure Blob Storage, you can refer to the [quickstart](https://learn.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-portal).
+These are the actions you need to take:
+
+* Create a storage account in the [Azure Portal](https://portal.azure.com/).
+* Create a container in the storage account.
+* Upload your files to the container.
+* Grant the user / identity / service principal (depending on your authentication method, see below) access to the container. At minimum, a **Storage Blob Data Reader** role is needed. See [this doc](https://learn.microsoft.com/en-us/azure/storage/blobs/authorize-data-operations-portal) for reference.
+
+#### Authentication
+
+We use Azure’s **Default Credential** system (DefaultAzureCredential) for secure and flexible authentication.
+This allows you to connect to Azure services without putting any secrets in the code or flow spec.
+It automatically chooses the best authentication method based on your environment:
+
+* On your local machine: uses your Azure CLI login (`az login`) or environment variables.
+
+  ```sh
+  az login
+  # Optional: Set a default subscription if you have more than one
+  az account set --subscription "<YOUR_SUBSCRIPTION_NAME_OR_ID>"
+  ```
+* In Azure (VM, App Service, AKS, etc.): uses the resource’s Managed Identity.
+* In automated environments: supports Service Principals via environment variables
+  * `AZURE_CLIENT_ID`
+  * `AZURE_TENANT_ID`
+  * `AZURE_CLIENT_SECRET`
+
+You can refer to [this doc](https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication/overview) for more details.
+
+### Spec
+
+The spec takes the following fields:
+
+* `account_name` (`str`): the name of the storage account.
+* `container_name` (`str`): the name of the container.
+* `prefix` (`str`, optional): if provided, only files whose paths start with this prefix will be imported.
+* `binary` (`bool`, optional): whether to read files as binary (instead of text).
+* `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`.
+  If not specified, all files will be included.
+* `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["*.tmp", "**/*.log"]`.
+  Any file or directory matching these patterns will be excluded even if they match `included_patterns`.
+  If not specified, no files will be excluded.
+
+:::info
+
+`included_patterns` and `excluded_patterns` use Unix-style glob syntax. See [globset syntax](https://docs.rs/globset/latest/globset/index.html#syntax) for the details.
+
+:::
+
+### Schema
+
+The output is a [*KTable*](/docs/core/data_types#ktable) with the following sub fields:
+
 * `filename` (*Str*, key): the filename of the file, including the path, relative to the root directory, e.g. `"dir1/file1.md"`.
 * `content` (*Str* if `binary` is `False`, otherwise *Bytes*): the content of the file.
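
A quick way to sanity-check the setup and authentication described above, independent of CocoIndex, is to list the container with the Azure Python SDK. This is a rough sketch assuming the `azure-identity` and `azure-storage-blob` packages are installed separately (they are not dependencies of this change); the account, container, and prefix come from the same environment variables the example project below uses.

```python
# Sanity-check sketch: verify DefaultAzureCredential can list blobs in the container.
# Assumes `pip install azure-identity azure-storage-blob`; not part of this commit.
import os

from azure.identity import DefaultAzureCredential
from azure.storage.blob import ContainerClient

account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
container_name = os.environ["AZURE_BLOB_CONTAINER_NAME"]
prefix = os.environ.get("AZURE_BLOB_PREFIX") or None

client = ContainerClient(
    account_url=f"https://{account_name}.blob.core.windows.net",
    container_name=container_name,
    # Picks up `az login`, Managed Identity, or Service Principal env vars.
    credential=DefaultAzureCredential(),
)

for blob in client.list_blobs(name_starts_with=prefix):
    print(blob.name)
```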

.env.example

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
# Database Configuration
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex

# Azure Blob Storage Configuration (Public test container - ready to use!)
AZURE_STORAGE_ACCOUNT_NAME=testnamecocoindex1
AZURE_BLOB_CONTAINER_NAME=testpublic1
AZURE_BLOB_PREFIX=

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
.env

README.md

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
This example builds an embedding index based on files stored in an Azure Blob Storage container.
It continuously updates the index as files are added / updated / deleted in the source container:
it keeps the index in sync with the Azure Blob Storage container effortlessly.

## Prerequisite

Before running the example, you need to:

1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.

2. Prepare for Azure Blob Storage.
   See [Setup for Azure Blob Storage](https://cocoindex.io/docs/ops/sources#setup-for-azure-blob-storage) for more details.

3. Create a `.env` file with your Azure Blob Storage container name and (optionally) prefix.
   Start by copying the `.env.example`, and then edit it to fill in your account name and container name.

   ```bash
   cp .env.example .env
   $EDITOR .env
   ```

   Example `.env` file:
   ```
   # Database Configuration
   COCOINDEX_DATABASE_URL=postgresql://localhost:5432/cocoindex

   # Azure Blob Storage Configuration
   AZURE_STORAGE_ACCOUNT_NAME=your-account-name
   AZURE_BLOB_CONTAINER_NAME=your-container-name
   ```

## Run

Install dependencies:

```sh
pip install -e .
```

Run:

```sh
python main.py
```

While it runs, it keeps observing changes in the Azure Blob Storage container and updates the index automatically.
At the same time, it accepts queries from the terminal and performs search on top of the up-to-date index.


## CocoInsight

CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).

Run CocoInsight to understand your RAG data pipeline:

```sh
cocoindex server -ci main.py
```

You can also add a `-L` flag to make the server keep updating the index to reflect source changes at the same time:

```sh
cocoindex server -ci -L main.py
```

Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).

main.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
from dotenv import load_dotenv
from psycopg_pool import ConnectionPool
import cocoindex
import os
from typing import Any


@cocoindex.transform_flow()
def text_to_embedding(
    text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[list[float]]:
    """
    Embed the text using a SentenceTransformer model.
    This is a shared logic between indexing and querying, so extract it as a function.
    """
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )


@cocoindex.flow_def(name="AzureBlobTextEmbedding")
def azure_blob_text_embedding_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    """
    Define an example flow that embeds text from Azure Blob Storage into a vector database.
    """
    account_name = os.environ["AZURE_STORAGE_ACCOUNT_NAME"]
    container_name = os.environ["AZURE_BLOB_CONTAINER_NAME"]
    prefix = os.environ.get("AZURE_BLOB_PREFIX", None)

    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.AzureBlob(
            account_name=account_name,
            container_name=container_name,
            prefix=prefix,
            included_patterns=["*.md", "*.mdx", "*.txt", "*.docx"],
            binary=False,
        )
    )

    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        doc["chunks"] = doc["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=2000,
            chunk_overlap=500,
        )

        with doc["chunks"].row() as chunk:
            chunk["embedding"] = text_to_embedding(chunk["text"])
            doc_embeddings.collect(
                filename=doc["filename"],
                location=chunk["location"],
                text=chunk["text"],
                embedding=chunk["embedding"],
            )

    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["filename", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
            )
        ],
    )


def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
    # Get the table name, for the export target in the azure_blob_text_embedding_flow above.
    table_name = cocoindex.utils.get_target_default_name(
        azure_blob_text_embedding_flow, "doc_embeddings"
    )
    # Evaluate the transform flow defined above with the input query, to get the embedding.
    query_vector = text_to_embedding.eval(query)
    # Run the query and get the results.
    with pool.connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT filename, text, embedding <=> %s::vector AS distance
                FROM {table_name} ORDER BY distance LIMIT %s
                """,
                (query_vector, top_k),
            )
            return [
                {"filename": row[0], "text": row[1], "score": 1.0 - row[2]}
                for row in cur.fetchall()
            ]


def _main() -> None:
    # Initialize the database connection pool.
    pool = ConnectionPool(os.getenv("COCOINDEX_DATABASE_URL"))

    azure_blob_text_embedding_flow.setup()
    update_stats = azure_blob_text_embedding_flow.update()
    print(update_stats)

    # Run queries in a loop to demonstrate the query capabilities.
    while True:
        query = input("Enter search query (or Enter to quit): ")
        if query == "":
            break
        # Run the query function with the database connection pool and the query.
        results = search(pool, query)
        print("\nSearch results:")
        for result in results:
            print(f"[{result['score']:.3f}] {result['filename']}")
            print(f"    {result['text']}")
            print("---")
        print()


if __name__ == "__main__":
    load_dotenv()
    cocoindex.init()
    _main()
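
The example performs a one-time `update()` before entering the query loop. To keep the index continuously in sync with the container, as the README describes, the flow can instead be driven by a live updater. A rough sketch, assuming `cocoindex.FlowLiveUpdater` with its documented context-manager usage; treat the exact API surface as an assumption rather than part of this change:

```python
# Sketch: continuous sync instead of a one-time update (assumes cocoindex.FlowLiveUpdater).
# Reuses azure_blob_text_embedding_flow defined in main.py above.
import cocoindex


def _main_live() -> None:
    azure_blob_text_embedding_flow.setup()
    # Watch the Azure Blob container and keep re-indexing added/updated/deleted
    # files until the process is interrupted.
    with cocoindex.FlowLiveUpdater(azure_blob_text_embedding_flow) as updater:
        updater.wait()
```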

pyproject.toml

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
[project]
name = "azure-blob-text-embedding"
version = "0.1.0"
description = "Simple example for cocoindex: build embedding index based on Azure Blob Storage files."
requires-python = ">=3.11"
dependencies = ["cocoindex[embeddings]>=0.1.63", "python-dotenv>=1.0.1"]

[tool.setuptools]
packages = []

python/cocoindex/sources.py

Lines changed: 15 additions & 0 deletions
@@ -43,3 +43,18 @@ class AmazonS3(op.SourceSpec):
     included_patterns: list[str] | None = None
     excluded_patterns: list[str] | None = None
     sqs_queue_url: str | None = None
+
+
+class AzureBlob(op.SourceSpec):
+    """
+    Import data from an Azure Blob Storage container. Supports optional prefix and file filtering by glob patterns.
+    """
+
+    _op_category = op.OpCategory.SOURCE
+
+    account_name: str
+    container_name: str
+    prefix: str | None = None
+    binary: bool = False
+    included_patterns: list[str] | None = None
+    excluded_patterns: list[str] | None = None
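
The `included_patterns` / `excluded_patterns` fields follow the semantics documented in `docs/docs/ops/sources.md` above: exclusions win over inclusions, and omitting `included_patterns` includes everything. The actual matching is done in Rust with `globset`; the snippet below only illustrates the documented precedence, using Python's `fnmatch` as a rough stand-in (it does not reproduce globset syntax such as `**`).

```python
# Illustration only: approximate the documented include/exclude semantics with fnmatch.
# The real source uses the Rust `globset` crate, whose syntax (e.g. `**`) is richer.
from fnmatch import fnmatch


def is_selected(
    filename: str,
    included_patterns: list[str] | None,
    excluded_patterns: list[str] | None,
) -> bool:
    # An exclusion match always wins, even if an include pattern also matches.
    if excluded_patterns and any(fnmatch(filename, p) for p in excluded_patterns):
        return False
    # No include patterns means every (non-excluded) file is included.
    if not included_patterns:
        return True
    return any(fnmatch(filename, p) for p in included_patterns)


print(is_selected("notes/readme.md", ["*.md"], ["*.tmp"]))  # True
print(is_selected("draft.md.tmp", ["*.md*"], ["*.tmp"]))    # False: exclusion wins
```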

src/ops/registration.rs

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result
     sources::local_file::Factory.register(registry)?;
     sources::google_drive::Factory.register(registry)?;
     sources::amazon_s3::Factory.register(registry)?;
+    sources::azure_blob::Factory.register(registry)?;
 
     functions::parse_json::Factory.register(registry)?;
     functions::split_recursively::register(registry)?;
