From ffb61c409798acb05f53c1288e07e106337adad5 Mon Sep 17 00:00:00 2001 From: Phillip Jones Date: Sun, 30 Mar 2025 15:34:29 -0700 Subject: [PATCH 1/9] Adds documentation for R2 Data Catalog --- src/content/docs/r2/api/tokens.mdx | 18 +- .../r2/data-catalog/config-examples/index.mdx | 16 + .../config-examples/pyiceberg.mdx | 50 +++ .../config-examples/snowflake.mdx | 62 ++++ .../r2/data-catalog/config-examples/spark.mdx | 175 +++++++++++ .../docs/r2/data-catalog/get-started.mdx | 291 ++++++++++++++++++ src/content/docs/r2/data-catalog/index.mdx | 54 ++++ src/content/docs/r2/data-migration/index.mdx | 83 ++--- src/content/docs/r2/demos.mdx | 18 +- src/content/docs/r2/examples/index.mdx | 5 +- src/content/docs/r2/platform/index.mdx | 4 +- src/content/docs/r2/pricing.mdx | 24 +- src/content/docs/r2/reference/index.mdx | 6 +- src/content/docs/r2/tutorials/index.mdx | 5 +- .../partials/workers/wrangler-commands/r2.mdx | 41 +++ 15 files changed, 782 insertions(+), 70 deletions(-) create mode 100644 src/content/docs/r2/data-catalog/config-examples/index.mdx create mode 100644 src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx create mode 100644 src/content/docs/r2/data-catalog/config-examples/snowflake.mdx create mode 100644 src/content/docs/r2/data-catalog/config-examples/spark.mdx create mode 100644 src/content/docs/r2/data-catalog/get-started.mdx create mode 100644 src/content/docs/r2/data-catalog/index.mdx diff --git a/src/content/docs/r2/api/tokens.mdx b/src/content/docs/r2/api/tokens.mdx index b870198dd24793e..6de80b3464ea72f 100644 --- a/src/content/docs/r2/api/tokens.mdx +++ b/src/content/docs/r2/api/tokens.mdx @@ -45,12 +45,18 @@ Jurisdictional buckets can only be accessed via the corresponding jurisdictional ## Permissions -| Permission | Description | -| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | -| Admin Read & Write | Allows the ability to create, list and delete buckets, and edit bucket configurations in addition to list, write, and read object access. | -| Admin Read only | Allows the ability to list buckets and view bucket configuration in addition to list and read object access. | -| Object Read & Write | Allows the ability to read, write, and list objects in specific buckets. | -| Object Read only | Allows the ability to read and list objects in specific buckets. | +| Permission | Description | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Admin Read & Write | Allows the ability to create, list, and delete buckets, edit bucket configuration, read, write, and list objects, and read and write access to data catalog tables and associated metadata. | +| Admin Read only | Allows the ability to list buckets and view bucket configuration, read and list objects, and read access to data catalog tables and associated metadata. | +| Object Read & Write | Allows the ability to read, write, and list objects in specific buckets. | +| Object Read only | Allows the ability to read and list objects in specific buckets. | + +:::note + +Currently Admin Read & Write or Admin Read only permission is required to interact with and query [R2 Data Catalog](/r2/data-catalog/). 
+ +::: ## Create API tokens via API diff --git a/src/content/docs/r2/data-catalog/config-examples/index.mdx b/src/content/docs/r2/data-catalog/config-examples/index.mdx new file mode 100644 index 000000000000000..6736adfa4461e2b --- /dev/null +++ b/src/content/docs/r2/data-catalog/config-examples/index.mdx @@ -0,0 +1,16 @@ +--- +pcx_content_type: navigation +title: Connect to query engines +head: [] +sidebar: + order: 3 + group: + hideIndex: true +description: Find detailed setup instructions for Apache Spark and other common query engines. +--- + +import { DirectoryListing } from "~/components"; + +Below are configuration examples to connect various Iceberg engines to [R2 Data Catalog](/r2/data-catalog/): + + diff --git a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx new file mode 100644 index 000000000000000..ad973b2fc67745d --- /dev/null +++ b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx @@ -0,0 +1,50 @@ +--- +title: PyIceberg +pcx_content_type: example +--- + +Below is an example of using [PyIceberg](https://py.iceberg.apache.org/) to connect to R2 Data Catalog. + +## Prerequisites + +- Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). +- Create an [R2 bucket](/r2/buckets/) and enable the data catalog. +- Create an [R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). +- Install the [PyIceberg](https://py.iceberg.apache.org/#installation) and [PyArrow](https://arrow.apache.org/docs/python/install.html) libraries. + +## Example usage + +```py +import pyarrow as pa +from pyiceberg.catalog.rest import RestCatalog +from pyiceberg.exceptions import NamespaceAlreadyExistsError + +# Define catalog connection details (replace variables) +WAREHOUSE = "" +TOKEN = "" +CATALOG_URI = "" + +# Connect to R2 Data Catalog +catalog = RestCatalog( + name="my_catalog", + warehouse=WAREHOUSE, + uri=CATALOG_URI, + token=TOKEN, +) + +# Create default namespace +catalog.create_namespace("default") + +# Create simple PyArrow table +df = pa.table({ + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], +}) + +# Create an Iceberg table +test_table = ("default", "my_table") +table = catalog.create_table( + test_table, + schema=df.schema, +) +``` diff --git a/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx b/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx new file mode 100644 index 000000000000000..8f04b79d32e1dac --- /dev/null +++ b/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx @@ -0,0 +1,62 @@ +--- +title: Snowflake +pcx_content_type: example +--- + +Below is an example of using [Snowflake](https://docs.snowflake.com/en/user-guide/tables-iceberg-configure-catalog-integration-rest) to connect and query data from R2 Data Catalog (read-only). + +## Prerequisites + +- Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). +- Create an [R2 bucket](/r2/buckets/) and enable the data catalog. +- Create an [R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). +- A [Snowflake](https://www.snowflake.com/) account with the necessary privileges to create external volumes and catalog integrations. 
+ +## Example usage + +In your Snowflake [SQL worksheet](https://docs.snowflake.com/en/user-guide/ui-snowsight-worksheets-gs) or [notebook](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks) run the following commands: + +```sql +-- Create a database (if you don't already have one) to organize your external data +CREATE DATABASE IF NOT EXISTS r2_example_db; + +-- Create an external volume pointing to your R2 bucket +CREATE OR REPLACE EXTERNAL VOLUME ext_vol_r2 + STORAGE_LOCATIONS = ( + ( + NAME = 'my_r2_storage_location' + STORAGE_PROVIDER = 'S3COMPAT' + STORAGE_BASE_URL = 's3compat://' + CREDENTIALS = ( + AWS_KEY_ID = '' + AWS_SECRET_KEY = '' + ) + STORAGE_ENDPOINT = '.r2.cloudflarestorage.com' + ) + ) + ALLOW_WRITES = FALSE; + +-- Create a catalog integration for R2 Data Catalog (read-only) +CREATE OR REPLACE CATALOG INTEGRATION r2_data_catalog + CATALOG_SOURCE = ICEBERG_REST + TABLE_FORMAT = ICEBERG + CATALOG_NAMESPACE = 'default' + REST_CONFIG = ( + CATALOG_URI = '' + CATALOG_NAME = '' + ) + REST_AUTHENTICATION = ( + TYPE = BEARER + BEARER_TOKEN = '' + ) + ENABLED = TRUE; + +-- Create an Apache Iceberg table in your selected Snowflake database +CREATE ICEBERG TABLE my_iceberg_table + CATALOG = 'r2_data_catalog' + EXTERNAL_VOLUME = 'ext_vol_r2' + CATALOG_TABLE_NAME = 'my_table'; -- Name of existing table in your R2 data catalog + +-- Query your Iceberg table +SELECT * FROM my_iceberg_table; +``` diff --git a/src/content/docs/r2/data-catalog/config-examples/spark.mdx b/src/content/docs/r2/data-catalog/config-examples/spark.mdx new file mode 100644 index 000000000000000..db4532e5df5cfd8 --- /dev/null +++ b/src/content/docs/r2/data-catalog/config-examples/spark.mdx @@ -0,0 +1,175 @@ +--- +title: Spark +pcx_content_type: example +--- + +Below is an example of how you can build an [Apache Spark](https://spark.apache.org/) application (with Scala) which connects to the R2 Data Catalog. This application is built to run locally, but it can be adapted to run on a cluster. + +## Prerequisites + +- Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). +- Create an [R2 bucket](/r2/buckets/) and enable the data catalog. +- Create an [R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). +- Install Java 17, Spark 3.5.3, and SBT 1.10.11 + - Note: The specific versions of tools are critical for getting things to work in this example. + - Tip: [“SDKMAN”](https://sdkman.io/) is a convenient package manager for installing SDKs. + +## Example usage + +To start, create a new empty project directory somewhere on your machine. Inside that directory, create the following file at `src/main/scala/com/example/R2DataCatalogDemo.scala`. This will serve as the main entry point for your Spark application. 
```scala
package com.example

import org.apache.spark.sql.SparkSession

object R2DataCatalogDemo {
  def main(args: Array[String]): Unit = {

    val uri = sys.env("CATALOG_URI")
    val warehouse = sys.env("WAREHOUSE")
    val token = sys.env("TOKEN")

    val spark = SparkSession.builder()
      .appName("My R2 Data Catalog Demo")
      .master("local[*]")
      .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
      .config("spark.sql.catalog.mydemo", "org.apache.iceberg.spark.SparkCatalog")
      .config("spark.sql.catalog.mydemo.type", "rest")
      .config("spark.sql.catalog.mydemo.uri", uri)
      .config("spark.sql.catalog.mydemo.warehouse", warehouse)
      .config("spark.sql.catalog.mydemo.token", token)
      .getOrCreate()

    import spark.implicits._

    val data = Seq(
      (1, "Alice", 25),
      (2, "Bob", 30),
      (3, "Charlie", 35),
      (4, "Diana", 40)
    ).toDF("id", "name", "age")

    spark.sql("USE mydemo")

    spark.sql("CREATE NAMESPACE IF NOT EXISTS demoNamespace")

    data.writeTo("demoNamespace.demotable").createOrReplace()

    val readResult = spark.sql("SELECT * FROM demoNamespace.demotable WHERE age > 30")
    println("Records with age > 30:")
    readResult.show()
  }
}
```

For building this application and managing dependencies, we will use [sbt (“simple build tool”)](https://www.scala-sbt.org/). The following is an example `build.sbt` file to place at the root of your project. It is configured to produce a "fat JAR", bundling all required dependencies.

```scala
name := "R2DataCatalogDemo"

version := "1.0"

val sparkVersion = "3.5.3"
val icebergVersion = "1.8.1"

// Use Spark binaries built for Scala 2.12 or 2.13; 2.12 is the more common choice.
// If you download Spark 3.5.3 with SDKMAN, it comes with Scala 2.12.18.
scalaVersion := "2.12.18"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion,
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.iceberg" % "iceberg-core" % icebergVersion,
  "org.apache.iceberg" % "iceberg-spark-runtime-3.5_2.12" % icebergVersion,
  "org.apache.iceberg" % "iceberg-aws-bundle" % icebergVersion,
)

// Build a fat JAR with all dependencies
assembly / assemblyMergeStrategy := {
  case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case "reference.conf" => MergeStrategy.concat
  case "application.conf" => MergeStrategy.concat
  case x if x.endsWith(".properties") => MergeStrategy.first
  case x => MergeStrategy.first
}

// For Java 17 compatibility
Compile / javacOptions ++= Seq("--release", "17")
```

To enable the [sbt-assembly plugin](https://github.com/sbt/sbt-assembly?tab=readme-ov-file) (used to build fat JARs), add the following to a new file at `project/assembly.sbt`:

```
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
```

Make sure Java, Spark, and sbt are installed and available in your shell. If you're using SDKMAN, you can install them as shown below:

```bash
sdk install java 17.0.14-amzn
sdk install spark 3.5.3
sdk install sbt 1.10.11
```

With everything installed, you can now build the project using sbt. This will generate a single bundled JAR file.

```bash
sbt clean assembly
```

After building, the output JAR should be located at `target/scala-2.12/R2DataCatalogDemo-assembly-1.0.jar`.

To run the application, you'll use `spark-submit`.
Below is an example shell script (`submit.sh`) that includes the necessary Java compatability flags for Spark on Java 17: + +``` +# We need to set these "--add-opens" so that Spark can run on Java 17 (it needs access to +# parts of the JVM which have been modularized and made internal). +JAVA_17_COMPATABILITY="--add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED" + +spark-submit \ +--conf "spark.driver.extraJavaOptions=$JAVA_17_COMPATABILITY" \ +--conf "spark.executor.extraJavaOptions=$JAVA_17_COMPATABILITY" \ +--class com.example.R2DataCatalogDemo target/scala-2.12/R2DataCatalogDemo-assembly-1.0.jar +``` + +Before running it, make sure the script is executable: + +```bash +chmod +x submit.sh +``` + +At this point, your project directory should be structured like this: + +``` +. +├── Makefile +├── README.md +├── build.sbt +├── project +│ ├── assembly.sbt +│ ├── build.properties +│ └── project +├── spark-submit.sh +└── src + └── main + └── scala + └── com + └── example + └── R2DataCatalogDemo.scala +``` + +Before submitting the job, make sure you have the required environment variable set for your catalog URI, warehouse, and [Cloudflare API token](/r2/api/tokens/). + +```bash +export CATALOG_URI= +export WAREHOUSE= +export TOKEN= +``` + +You're now ready to run the job: + +```bash +./submit.sh +``` diff --git a/src/content/docs/r2/data-catalog/get-started.mdx b/src/content/docs/r2/data-catalog/get-started.mdx new file mode 100644 index 000000000000000..5d610dbaaf74b80 --- /dev/null +++ b/src/content/docs/r2/data-catalog/get-started.mdx @@ -0,0 +1,291 @@ +--- +pcx_content_type: get-started +title: Get started +head: [] +sidebar: + order: 2 +description: Learn how to enable the R2 Data Catalog on your bucket, load sample data, and run your first query. +--- + +import { + Render, + PackageManagers, + Steps, + FileTree, + Tabs, + TabItem, + TypeScriptExample, + WranglerConfig, + LinkCard, +} from "~/components"; + +## Overview + +This guide will instruct you through: + +- Creating your first [R2 bucket](/r2/buckets/) and enabling its [data catalog](/r2/data-catalog/). +- Creating an [API token](/r2/api/tokens/) needed for query engines to authenticate with your data catalog. +- Using [PyIceberg](https://py.iceberg.apache.org/) to create your first Iceberg table in a [marimo](https://marimo.io/) Python notebook. +- Using [PyIceberg](https://py.iceberg.apache.org/) to load sample data into your table and query it. + +## Prerequisites + + + +## 1. Create an R2 bucket + + + + + +1. If not already logged in, run: + + ``` + npx wrangler login + ``` + +2. Then, enable the catalog on your chosen R2 bucket: + + ``` + npx wrangler r2 bucket r2-data-catalog-tutorial + ``` + + + + + + + +1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar. +2. Select the bucket you want to enable as a data catalog. +3. Switch to the **Settings** tab, scroll down to **R2 Data Catalog**, and select **Enable**. +4. Once enabled, note the **Catalog URI** and **Warehouse name**. + + + + +## 2. Enable the data catalog for your bucket + + + + +Then, enable the catalog on your chosen R2 bucket: + + ``` + npx wrangler r2 bucket catalog enable r2-data-catalog-tutorial + ``` + + + + + +1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar. +2. Select the bucket you want to enable as a data catalog. +3. 
Switch to the **Settings** tab, scroll down to **R2 Data Catalog**, and select **Enable**. +4. Once enabled, note the **Catalog URI** and **Warehouse name**. + + + + +## 3. Create an API token + +Iceberg clients (including [PyIceberg](https://py.iceberg.apache.org/)) must authenticate to the catalog with a [Cloudflare API token](/fundamentals/api/get-started/create-token/) that has both R2 and catalog permissions. + + +1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar. + +2. Expand the **API** dropdown and select **Manage API tokens**. + +3. Select **Create API token**. + +4. Select the **R2 Token** text to edit your API token name. + +5. Under **Permissions**, choose the **Admin Read & Write** permission. + +6. Select **Create API Token**. + +7. Note the **Token value**, you will need this. + + + +## 4. Install uv + +Next, you'll need to install a Python package manager, in this guide we'll be using [uv](https://docs.astral.sh/uv/). If you don't already have uv installed, follow the [installing uv guide](https://docs.astral.sh/uv/getting-started/installation/). + +## 5. Install marimo + +We'll be using [marimo](https://github.com/marimo-team/marimo) as a Python notebook. + + +1. Create a directory where our notebook will live: + + ``` + mkdir r2-data-catalog-notebook + ``` + +2. Change into our new directory: + + ``` + cd r2-data-catalog-notebook + ``` + +3. Create a new Python virtual environment: + + ``` + uv venv + ``` + +4. Activate the Python virtual environment: + + ``` + source .venv/bin/activate + ``` + +5. Install marimo with uv: + + ```py + uv pip install marimo + ``` + + + +## 6. Create a Python notebook to interact with the data warehouse + + +1. Create a file called `r2-data-catalog-tutorial.py`. + +2. Paste the following code snippet into your `r2-data-catalog-tutorial.py` file: + + ```py + import marimo + + __generated_with = "0.11.31" + app = marimo.App(width="medium") + + + @app.cell + def _(): + import marimo as mo + return (mo,) + + + @app.cell + def _(): + import pandas + import pyarrow as pa + import pyarrow.compute as pc + import pyarrow.parquet as pq + + from pyiceberg.catalog.rest import RestCatalog + from pyiceberg.exceptions import NamespaceAlreadyExistsError + + # Define catalog connection details (replace variables) + WAREHOUSE = "" + TOKEN = "" + CATALOG_URI = "" + + # Connect to R2 Data Catalog + catalog = RestCatalog( + name="my_catalog", + warehouse=WAREHOUSE, + uri=CATALOG_URI, + token=TOKEN, + ) + return ( + CATALOG_URI, + NamespaceAlreadyExistsError, + RestCatalog, + TOKEN, + WAREHOUSE, + catalog, + pa, + pandas, + pc, + pq, + ) + + + @app.cell + def _(NamespaceAlreadyExistsError, catalog): + # Create default namespace if needed + try: + catalog.create_namespace("default") + except NamespaceAlreadyExistsError: + pass + return + + + @app.cell + def _(pa): + # Create simple PyArrow table + df = pa.table({ + "id": [1, 2, 3], + "name": ["Alice", "Bob", "Charlie"], + "score": [80.0, 92.5, 88.0], + }) + return (df,) + + + @app.cell + def _(catalog, df): + # Create or load Iceberg table + test_table = ("default", "people") + if not catalog.table_exists(test_table): + print(f"Creating table: {test_table}") + table = catalog.create_table( + test_table, + schema=df.schema, + ) + else: + table = catalog.load_table(test_table) + return table, test_table + + + @app.cell + def _(df, table): + # Append data + table.append(df) + return + + + @app.cell + def _(table): + print("Table contents:") + scanned = table.scan().to_arrow() + 
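        # Convert the PyArrow Table to a pandas DataFrame for display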
print(scanned.to_pandas()) + return (scanned,) + + + @app.cell + def _(): + # Optional cleanup. To run uncomment and run cell + # print(f"Deleting table: {test_table}") + # catalog.drop_table(test_table) + # print("Table dropped.") + return + + + if __name__ == "__main__": + app.run() + ``` + +3. Replace the `CATALOG_URI`, `WAREHOUSE` and `TOKEN` variables with your values from sections **2** and **3** respectively. + + +In the Python notebook above, you: + +1. Connect to your catalog. +2. Create the `default` namespace. +3. Create a simple PyArrow table. +4. Create (or load) the `people` table in the `default` namespace. +5. Append sample data to the table. +6. Print the contents of the table. +7. (Optional) Drop the `people` table we created for this tutorial. + +## Learn more + + diff --git a/src/content/docs/r2/data-catalog/index.mdx b/src/content/docs/r2/data-catalog/index.mdx new file mode 100644 index 000000000000000..85e7d54ea15becc --- /dev/null +++ b/src/content/docs/r2/data-catalog/index.mdx @@ -0,0 +1,54 @@ +--- +pcx_content_type: navigation +title: R2 Data Catalog +sidebar: + order: 7 + group: + badge: Beta +head: [] +description: A managed Apache Iceberg data catalog built directly into R2 buckets. +--- + +import { Render, LinkCard } from "~/components"; + +:::note +R2 Data Catalog is in **public beta**, and any developer with an [R2 subscription](/r2/pricing/) can start using it. Currently, outside of standard R2 storage and operations, you will not be billed for your use of R2 Data Catalog. +::: + +R2 Data Catalog is a managed [Apache Iceberg](https://iceberg.apache.org/) data catalog built directly into your R2 bucket. It exposes a standard Iceberg REST catalog interface, so you can connect the engines you already use, like [Spark](/r2/data-catalog/config-examples/spark/), [Snowflake](/r2/data-catalog/config-examples/snowflake/), and [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/). + +R2 Data Catalog makes it easy to turn an R2 bucket into a data warehouse or lakehouse for a variety of analytical workloads including log analytics, business intelligence, and data pipelines. R2's zero-egress fee model means that data users and consumers can access and analyze data from different clouds, data platforms, or regions without incurring transfer costs. + +Refer to the [get started guide](/r2/data-catalog/get-started/) to start with R2 Data Catalog. + +## What is Apache Iceberg? + +[Apache Iceberg](https://iceberg.apache.org/) is an open table format designed to handle large-scale analytics datasets stored in object storage. Key features include: + +- ACID transactions - Ensures reliable, concurrent reads and writes with full data integrity. +- Optimized metadata - Avoids costly full table scans by using indexed metadata for faster queries. +- Full schema evolution - Allows adding, renaming, and deleting columns without rewriting data. + +Iceberg is already [widely supported](https://iceberg.apache.org/vendors/) by engines like Apache Spark, Trino, Snowflake, DuckDB, and ClickHouse, with a fast-growing community behind it. + +## Why do you need a data catalog? + +Although the Iceberg data and metadata files themselves live directly in object storage (like [R2](https://developers.cloudflare.com/r2/)), the list of tables and pointers to the current metadata need to be tracked centrally by a data catalog. + +Think of a data catalog as a library's index system. 
While books (your data) are physically distributed across shelves (object storage), the index provides a single source of truth about what books exist, their locations, and their latest editions. Without this index, readers (query engines) would waste time searching for books, might access outdated versions, or could accidentally shelve new books in ways that make them unfindable. + +Similarly, data catalogs ensure consistent, coordinated access, which allows multiple query engines to safely read from and write to the same tables without conflicts or data corruption. + +## Learn more + + + + diff --git a/src/content/docs/r2/data-migration/index.mdx b/src/content/docs/r2/data-migration/index.mdx index 0b43d3ce4ffad90..381d2ea810ef5ba 100644 --- a/src/content/docs/r2/data-migration/index.mdx +++ b/src/content/docs/r2/data-migration/index.mdx @@ -1,53 +1,56 @@ --- -title: Data Migration +title: Data migration pcx_content_type: navigation learning_center: title: What is data migration? link: https://www.cloudflare.com/learning/cloud/what-is-data-migration/ sidebar: order: 3 - --- Quickly and easily migrate data from other cloud providers to R2. Explore each option further by navigating to their respective documentation page. - - - - - - - - - - - - - - - + + + + + + + + + + + + + + +
- Name - - Description - - When to use -
- Super Slurper - - Quickly migrate large amounts of data from other cloud providers to R2. - -
    -
  • For one-time, comprehensive transfers.
  • -
-
- Sippy - - Incremental data migration, populating your R2 bucket as objects are requested. - -
    -
  • For gradual migration that avoids upfront egress fees.
  • -
  • To start serving frequently accessed objects from R2 without a full migration.
  • -
-
+ Name + + Description + + When to use +
+ Super Slurper + + Quickly migrate large amounts of data from other cloud providers to R2. + +
    +
  • For one-time, comprehensive transfers.
  • +
+
+ Sippy + + Incremental data migration, populating your R2 bucket as objects are + requested. + +
    +
  • For gradual migration that avoids upfront egress fees.
  • +
  • + To start serving frequently accessed objects from R2 without a full + migration. +
  • +
+
diff --git a/src/content/docs/r2/demos.mdx b/src/content/docs/r2/demos.mdx index 33d75f6909e7220..8bc6ecf4e982a71 100644 --- a/src/content/docs/r2/demos.mdx +++ b/src/content/docs/r2/demos.mdx @@ -2,11 +2,14 @@ pcx_content_type: navigation title: Demos and architectures sidebar: - order: 9 - + order: 10 --- -import { ExternalResources, GlossaryTooltip, ResourcesBySelector } from "~/components" +import { + ExternalResources, + GlossaryTooltip, + ResourcesBySelector, +} from "~/components"; Learn how you can use R2 within your existing application and architecture. @@ -20,4 +23,11 @@ Explore the following demo applications Explore the following reference architectures that use R2: - + diff --git a/src/content/docs/r2/examples/index.mdx b/src/content/docs/r2/examples/index.mdx index 4f5ff68dd3c5a72..84b5c622e3a5731 100644 --- a/src/content/docs/r2/examples/index.mdx +++ b/src/content/docs/r2/examples/index.mdx @@ -3,11 +3,10 @@ type: overview pcx_content_type: navigation title: Examples sidebar: - order: 7 - + order: 8 --- -import { DirectoryListing, GlossaryTooltip } from "~/components" +import { DirectoryListing, GlossaryTooltip } from "~/components"; Explore the following examples of how to use SDKs and other tools with R2. diff --git a/src/content/docs/r2/platform/index.mdx b/src/content/docs/r2/platform/index.mdx index 80ceb2026b3c092..480e81c8c0a1685 100644 --- a/src/content/docs/r2/platform/index.mdx +++ b/src/content/docs/r2/platform/index.mdx @@ -2,7 +2,7 @@ title: Platform pcx_content_type: navigation sidebar: - order: 9 - group: + order: 11 + group: hideIndex: true --- diff --git a/src/content/docs/r2/pricing.mdx b/src/content/docs/r2/pricing.mdx index 16173085e814958..01d72f995f797d4 100644 --- a/src/content/docs/r2/pricing.mdx +++ b/src/content/docs/r2/pricing.mdx @@ -2,7 +2,7 @@ pcx_content_type: concept title: Pricing sidebar: - order: 11 + order: 13 --- import { InlineBadge } from "~/components"; @@ -80,6 +80,12 @@ For objects stored in Infrequent Access storage, you will be charged for the obj | Standard storage | None | | Infrequent Access storage | 30 days | +## R2 Data Catalog pricing + +R2 Data Catalog is in **public beta**, and any developer with [R2 subscription](/r2/pricing/) can start using it. Currently, outside of standard R2 storage and operations, you will not be billed for your use of R2 Data Catalog. We'll provide at least 30 days notice before we make any changes or start charging for usage + +To learn more about our thinking on future pricing, refer to the [R2 Data Catalog announcement blog](https://blog.cloudflare.com/r2-data-catalog-public-beta). 
+ ## Data migration pricing ### Super Slurper @@ -116,13 +122,13 @@ If a user writes 1,000 objects in R2 for 1 month with an average size of 1 GB an If a user writes 10 objects in R2 for 1 month with an average size of 1 GB and requests 1,000 times per month, the estimated cost for the month would be: -| | Usage | Free Tier | Billable Quantity | Price | -| ------------------ | ------------------------------------------- | ------------ | ----------------- | ---------- | -| Class B Operations | (1,000 objects) \* (1,000 reads per object) | 10 million | 0 | $0.00 | -| Class A Operations | (1,000 objects) \* (1 write per object) | 1 million | 0 | $0.00 | -| Storage | (10 objects) \* (1 GB per object) | 10 GB-months | 0 | $0.00 | -| **TOTAL** | | | | **$0.00** | -| | | | | | +| | Usage | Free Tier | Billable Quantity | Price | +| ------------------ | ------------------------------------------- | ------------ | ----------------- | --------- | +| Class B Operations | (1,000 objects) \* (1,000 reads per object) | 10 million | 0 | $0.00 | +| Class A Operations | (1,000 objects) \* (1 write per object) | 1 million | 0 | $0.00 | +| Storage | (10 objects) \* (1 GB per object) | 10 GB-months | 0 | $0.00 | +| **TOTAL** | | | | **$0.00** | +| | | | | | ### Asset hosting @@ -146,4 +152,4 @@ To learn more about how usage is billed, refer to [Cloudflare Billing Policy](/s No. You are not charged for operations when the caller does not have permission to make the request (HTTP 401 `Unauthorized` response status code). -[^1]: Egressing directly from R2, including via the [Workers API](/r2/api/workers/), [S3 API](/r2/api/s3/), and [`r2.dev` domains](/r2/buckets/public-buckets/#enable-managed-public-access) does not incur data transfer (egress) charges and is free. If you connect other metered services to an R2 bucket, you may be charged by those services. \ No newline at end of file +[^1]: Egressing directly from R2, including via the [Workers API](/r2/api/workers/), [S3 API](/r2/api/s3/), and [`r2.dev` domains](/r2/buckets/public-buckets/#enable-managed-public-access) does not incur data transfer (egress) charges and is free. If you connect other metered services to an R2 bucket, you may be charged by those services. diff --git a/src/content/docs/r2/reference/index.mdx b/src/content/docs/r2/reference/index.mdx index 50174cbd35dc1c7..05f901055d39be7 100644 --- a/src/content/docs/r2/reference/index.mdx +++ b/src/content/docs/r2/reference/index.mdx @@ -2,11 +2,11 @@ title: Reference pcx_content_type: navigation sidebar: - order: 10 + order: 12 group: hideIndex: true --- -import { DirectoryListing } from "~/components" +import { DirectoryListing } from "~/components"; - \ No newline at end of file + diff --git a/src/content/docs/r2/tutorials/index.mdx b/src/content/docs/r2/tutorials/index.mdx index f43ff189d3bc3e1..fe802403a0f3156 100644 --- a/src/content/docs/r2/tutorials/index.mdx +++ b/src/content/docs/r2/tutorials/index.mdx @@ -3,11 +3,10 @@ hideChildren: true pcx_content_type: navigation title: Tutorials sidebar: - order: 8 - + order: 9 --- -import { GlossaryTooltip, ListTutorials } from "~/components" +import { GlossaryTooltip, ListTutorials } from "~/components"; View tutorials to help you get started with R2. 
diff --git a/src/content/partials/workers/wrangler-commands/r2.mdx b/src/content/partials/workers/wrangler-commands/r2.mdx index 9b6786229f7ceec..f8bfeb2516a40e8 100644 --- a/src/content/partials/workers/wrangler-commands/r2.mdx +++ b/src/content/partials/workers/wrangler-commands/r2.mdx @@ -62,6 +62,47 @@ List R2 bucket in the current account. wrangler r2 bucket list ``` + + +Enable [R2 Data Catalog](/r2/data-catalog/) on an R2 bucket. + +```txt +wrangler r2 bucket catalog enable [OPTIONS] +``` + +- `NAME` + - The name of the bucket to enable R2 Data Catalog for. + + + +Disable [R2 Data Catalog](/r2/data-catalog/) on an R2 bucket. + +```txt +wrangler r2 bucket catalog disable [OPTIONS] +``` + +- `NAME` + - The name of the bucket to disable R2 Data Catalog for. + + + +Get the status of [R2 Data Catalog](/r2/data-catalog/) for an R2 bucket, including catalog URI and warehouse name. + +```txt +wrangler r2 bucket catalog get [OPTIONS] +``` + +- `NAME` + - The name of the R2 bucket whose data catalog status to retrieve. + Set the [CORS configuration](/r2/buckets/cors/) for an R2 bucket from a JSON file. From 5eab4a3ba0b6f9911cb47611868ee19fb21adf24 Mon Sep 17 00:00:00 2001 From: Phillip Jones Date: Sat, 5 Apr 2025 09:40:14 -0700 Subject: [PATCH 2/9] Added managing catalogs documentation and R2 Data Catalog as a product. --- .../r2/data-catalog/config-examples/index.mdx | 4 +- .../docs/r2/data-catalog/get-started.mdx | 8 +- src/content/docs/r2/data-catalog/index.mdx | 8 +- .../r2/data-catalog/managing-catalogs.mdx | 95 +++++++++++++++++++ src/content/products/r2-data-catalog.yaml | 19 ++++ src/icons/r2-data-catalog.svg | 1 + 6 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 src/content/docs/r2/data-catalog/managing-catalogs.mdx create mode 100644 src/content/products/r2-data-catalog.yaml create mode 100644 src/icons/r2-data-catalog.svg diff --git a/src/content/docs/r2/data-catalog/config-examples/index.mdx b/src/content/docs/r2/data-catalog/config-examples/index.mdx index 6736adfa4461e2b..d0d4e7ea80b71ca 100644 --- a/src/content/docs/r2/data-catalog/config-examples/index.mdx +++ b/src/content/docs/r2/data-catalog/config-examples/index.mdx @@ -1,9 +1,9 @@ --- pcx_content_type: navigation -title: Connect to query engines +title: Connect to Iceberg engines head: [] sidebar: - order: 3 + order: 4 group: hideIndex: true description: Find detailed setup instructions for Apache Spark and other common query engines. 
diff --git a/src/content/docs/r2/data-catalog/get-started.mdx b/src/content/docs/r2/data-catalog/get-started.mdx index 5d610dbaaf74b80..c4d276c50f7661b 100644 --- a/src/content/docs/r2/data-catalog/get-started.mdx +++ b/src/content/docs/r2/data-catalog/get-started.mdx @@ -285,7 +285,13 @@ In the Python notebook above, you: ## Learn more + + diff --git a/src/content/docs/r2/data-catalog/index.mdx b/src/content/docs/r2/data-catalog/index.mdx index 85e7d54ea15becc..300d59214ba2c85 100644 --- a/src/content/docs/r2/data-catalog/index.mdx +++ b/src/content/docs/r2/data-catalog/index.mdx @@ -48,7 +48,13 @@ Similarly, data catalogs ensure consistent, coordinated access, which allows mul /> + + diff --git a/src/content/docs/r2/data-catalog/managing-catalogs.mdx b/src/content/docs/r2/data-catalog/managing-catalogs.mdx new file mode 100644 index 000000000000000..dad8406450cfb60 --- /dev/null +++ b/src/content/docs/r2/data-catalog/managing-catalogs.mdx @@ -0,0 +1,95 @@ +--- +pcx_content_type: configuration +title: Managing catalogs +description: Understand how to manage Iceberg REST catalogs associated with R2 buckets +sidebar: + order: 3 +--- + +import { + Render, + PackageManagers, + Steps, + FileTree, + Tabs, + TabItem, + TypeScriptExample, + WranglerConfig, + LinkCard, +} from "~/components"; + +Learn how to enable and disable [R2 Data Catalog](/r2/data-catalog/) on your buckets and authenticate Iceberg engines using API tokens. + +## Enable R2 Data Catalog on a bucket + +Enabling the catalog on a bucket turns on the REST catalog interface and provides a **Catalog URI** and **Warehouse name** required by Iceberg clients. Once enabled, you can create and manage Iceberg tables in that bucket. + +### Dashboard + + +1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar. +2. Select the bucket you want to enable as a data catalog. +3. Switch to the **Settings** tab, scroll down to **R2 Data Catalog**, and select **Enable**. +4. Once enabled, note the **Catalog URI** and **Warehouse name**. + + +### Wrangler CLI + +To enable the catalog on your bucket, run the [`r2 bucket catalog enable command`](/workers/wrangler/commands/#r2-bucket-catalog-enable): + +```bash +npx wrangler r2 bucket catalog enable +``` + +After enabling, Wrangler will return your catalog URI and warehouse name. + +## Disable R2 Data Catalog on a bucket + +When you disable the catalog on a bucket, it immediately stops serving requests from the catalog interface. Any Iceberg table references stored in that catalog become inaccessible until you re-enable it. + +### Dashboard + + +1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar. +2. Select the bucket where you want to disable the data catalog. +3. Switch to the **Settings** tab, scroll down to **R2 Data Catalog**, and select **Disable**. + + +### Wrangler CLI + +To disable the catalog on your bucket, run the [`r2 bucket catalog disable command`](/workers/wrangler/commands/#r2-bucket-catalog-disable): + +```bash +npx wrangler r2 bucket catalog disable +``` + +## Authenticate your Iceberg engine + +To connect your Iceberg engine to R2 Data Catalog, you will need a Cloudflare API token with both [R2 and R2 Data Catalog permissions](/r2/api/tokens/#permissions). Iceberg engines that support the REST catalog let you supply this token to authenticate with R2 Data Catalog. + + +1. Create an [R2 API token](/r2/api/tokens/#permissions) with **Admin Read & Write** or **Admin Read only** permission. +2. 
Copy the **Token value** from your new API token. +3. In your engine configuration, provide this token as a bearer token. + Internally, this token will be sent as: + + ``` + Authorization: Bearer + ``` + + in HTTP requests to your bucket's data catalog. + + +## Learn more + + + + diff --git a/src/content/products/r2-data-catalog.yaml b/src/content/products/r2-data-catalog.yaml new file mode 100644 index 000000000000000..3ec5ed9c87b2071 --- /dev/null +++ b/src/content/products/r2-data-catalog.yaml @@ -0,0 +1,19 @@ +name: R2 Data Catalog + +product: + title: R2 Data Catalog + url: /r2/data-catalog/ + group: Developer platform + additional_groups: [Storage] + tags: [Storage] + +meta: + title: Cloudflare R2 Data Catalog docs + description: Create, manage, and query Iceberg tables stored in R2. + author: "@cloudflare" + +resources: + community: https://community.cloudflare.com/c/developers/workers/40 + dashboard_link: https://dash.cloudflare.com/?to=/:account/r2 + discord: https://discord.com/channels/595317990191398933/940663374377783388 + learning_center: https://www.cloudflare.com/learning/cloud/what-is-object-storage/ diff --git a/src/icons/r2-data-catalog.svg b/src/icons/r2-data-catalog.svg new file mode 100644 index 000000000000000..3d391a2de4036ae --- /dev/null +++ b/src/icons/r2-data-catalog.svg @@ -0,0 +1 @@ + \ No newline at end of file From 4826f79143100238a70fbdfd1d77accc4ce07abb Mon Sep 17 00:00:00 2001 From: Phillip Jones Date: Sat, 5 Apr 2025 14:10:22 -0700 Subject: [PATCH 3/9] Add changelog entry --- .../r2/2025-04-10-r2-data-catalog-beta.mdx | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx diff --git a/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx b/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx new file mode 100644 index 000000000000000..165deb633da91a1 --- /dev/null +++ b/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx @@ -0,0 +1,22 @@ +--- +title: R2 Data Catalog is a managed Apache Iceberg data catalog built directly into R2 buckets +description: A managed Apache Iceberg data catalog built directly into R2 buckets +products: + - r2 +date: 2025-04-10T13:00:00Z +hidden: true +--- + +Today, we're launching [R2 Data Catalog](/r2/data-catalog/) in open beta, a managed Apache Iceberg catalog built directly into your [Cloudflare R2](/r2/) bucket. + +If you're not already familiar with it, [Apache Iceberg](https://iceberg.apache.org/) is an open table format designed to handle large-scale analytics datasets stored in object storage, offering ACID transactions and schema evolution. R2 Data Catalog exposes a standard Iceberg REST catalog interface, so you can connect engines like [Spark](/r2/data-catalog/config-examples/spark/), [Snowflake](/r2/data-catalog/config-examples/snowflake/), and [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/) to start querying your tables using the tools you already know. + +To enable a data catalog on your R2 bucket, find **R2 Data Catalog** in your buckets settings in the dashboard or run: + +```bash +npx wrangler r2 bucket catalog enable my-bucket +``` + +And that's it. You'll get a catalog URI and warehouse you can plug into your favorite Iceberg engines. + +Visit our [getting started guide](/r2/data-catalog/get-started/) for step-by-step instructions on enabling R2 Data Catalog, creating tables, and running your first queries. 
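
Once the catalog is enabled, any Iceberg REST client can connect to it. Below is a minimal [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/) sketch; the placeholder values are assumptions, so substitute your own catalog URI, warehouse name, and [R2 API token](/r2/api/tokens/):

```py
from pyiceberg.catalog.rest import RestCatalog

# Placeholders: copy these values from your bucket's R2 Data Catalog settings
catalog = RestCatalog(
    name="my_catalog",
    warehouse="<WAREHOUSE>",
    uri="<CATALOG_URI>",
    token="<TOKEN>",
)

# List namespaces to confirm the connection works
print(catalog.list_namespaces())
```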
From fdc400b96e113ad5e8a40f9800fd4df735a52476 Mon Sep 17 00:00:00 2001 From: Jun Lee Date: Mon, 7 Apr 2025 11:25:59 +0100 Subject: [PATCH 4/9] PCX review --- src/content/docs/r2/api/tokens.mdx | 6 +-- .../config-examples/pyiceberg.mdx | 4 +- .../config-examples/snowflake.mdx | 6 +-- .../r2/data-catalog/config-examples/spark.mdx | 52 ++++++++++--------- .../docs/r2/data-catalog/get-started.mdx | 16 +++--- src/content/docs/r2/data-catalog/index.mdx | 4 +- ...aging-catalogs.mdx => manage-catalogs.mdx} | 9 ++-- src/content/docs/r2/pricing.mdx | 16 +++--- 8 files changed, 60 insertions(+), 53 deletions(-) rename src/content/docs/r2/data-catalog/{managing-catalogs.mdx => manage-catalogs.mdx} (94%) diff --git a/src/content/docs/r2/api/tokens.mdx b/src/content/docs/r2/api/tokens.mdx index 6de80b3464ea72f..1f50863aca0b5df 100644 --- a/src/content/docs/r2/api/tokens.mdx +++ b/src/content/docs/r2/api/tokens.mdx @@ -47,14 +47,14 @@ Jurisdictional buckets can only be accessed via the corresponding jurisdictional | Permission | Description | | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Admin Read & Write | Allows the ability to create, list, and delete buckets, edit bucket configuration, read, write, and list objects, and read and write access to data catalog tables and associated metadata. | -| Admin Read only | Allows the ability to list buckets and view bucket configuration, read and list objects, and read access to data catalog tables and associated metadata. | +| Admin Read & Write | Allows the ability to create, list, and delete buckets, edit bucket configuration, read, write, and list objects, and read and write to data catalog tables and associated metadata. | +| Admin Read only | Allows the ability to list buckets and view bucket configuration, read and list objects, and read from the data catalog tables and associated metadata. | | Object Read & Write | Allows the ability to read, write, and list objects in specific buckets. | | Object Read only | Allows the ability to read and list objects in specific buckets. | :::note -Currently Admin Read & Write or Admin Read only permission is required to interact with and query [R2 Data Catalog](/r2/data-catalog/). +Currently **Admin Read & Write** or **Admin Read only** permission is required to interact with [R2 Data Catalog](/r2/data-catalog/). ::: diff --git a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx index ad973b2fc67745d..f79e4849009cc21 100644 --- a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx +++ b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx @@ -8,8 +8,8 @@ Below is an example of using [PyIceberg](https://py.iceberg.apache.org/) to conn ## Prerequisites - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). -- Create an [R2 bucket](/r2/buckets/) and enable the data catalog. -- Create an [R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). +- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog. +- [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). 
- Install the [PyIceberg](https://py.iceberg.apache.org/#installation) and [PyArrow](https://arrow.apache.org/docs/python/install.html) libraries. ## Example usage diff --git a/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx b/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx index 8f04b79d32e1dac..2aa71e16c16dc13 100644 --- a/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx +++ b/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx @@ -8,13 +8,13 @@ Below is an example of using [Snowflake](https://docs.snowflake.com/en/user-guid ## Prerequisites - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). -- Create an [R2 bucket](/r2/buckets/) and enable the data catalog. -- Create an [R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). +- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog. +- [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). - A [Snowflake](https://www.snowflake.com/) account with the necessary privileges to create external volumes and catalog integrations. ## Example usage -In your Snowflake [SQL worksheet](https://docs.snowflake.com/en/user-guide/ui-snowsight-worksheets-gs) or [notebook](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks) run the following commands: +In your Snowflake [SQL worksheet](https://docs.snowflake.com/en/user-guide/ui-snowsight-worksheets-gs) or [notebook](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks), run the following commands: ```sql -- Create a database (if you don't already have one) to organize your external data diff --git a/src/content/docs/r2/data-catalog/config-examples/spark.mdx b/src/content/docs/r2/data-catalog/config-examples/spark.mdx index db4532e5df5cfd8..7ae24babd14501d 100644 --- a/src/content/docs/r2/data-catalog/config-examples/spark.mdx +++ b/src/content/docs/r2/data-catalog/config-examples/spark.mdx @@ -3,20 +3,25 @@ title: Spark pcx_content_type: example --- -Below is an example of how you can build an [Apache Spark](https://spark.apache.org/) application (with Scala) which connects to the R2 Data Catalog. This application is built to run locally, but it can be adapted to run on a cluster. +import { FileTree } from "~/components" + + +Below is an example of how you can build an [Apache Spark](https://spark.apache.org/) application (with Scala) which connects to R2 Data Catalog. This application is built to run locally, but it can be adapted to run on a cluster. ## Prerequisites - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). -- Create an [R2 bucket](/r2/buckets/) and enable the data catalog. -- Create an [R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). +- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog. +- [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions). - Install Java 17, Spark 3.5.3, and SBT 1.10.11 - Note: The specific versions of tools are critical for getting things to work in this example. - Tip: [“SDKMAN”](https://sdkman.io/) is a convenient package manager for installing SDKs. ## Example usage -To start, create a new empty project directory somewhere on your machine. 
Inside that directory, create the following file at `src/main/scala/com/example/R2DataCatalogDemo.scala`. This will serve as the main entry point for your Spark application.
+To start, create a new empty project directory somewhere on your machine.
+
+Inside that directory, create the following file at `src/main/scala/com/example/R2DataCatalogDemo.scala`. This will serve as the main entry point for your Spark application.

```scala
package com.example

@@ -105,7 +110,7 @@
To enable the [sbt-assembly plugin](https://github.com/sbt/sbt-assembly?tab=readme-ov-file) (used to build fat JARs), add the following to a new file at `project/assembly.sbt`:

```
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
```

-Make sure Java, Spark, and sbt are installed and available in your shell. If you're using SDKMAN, you can install them as shown below:
+Make sure Java, Spark, and sbt are installed and available in your shell. If you are using SDKMAN, you can install them as shown below:

```bash
sdk install java 17.0.14-amzn
sdk install spark 3.5.3
sdk install sbt 1.10.11

@@ -121,7 +126,7 @@
sbt clean assembly
```

After building, the output JAR should be located at `target/scala-2.12/R2DataCatalogDemo-assembly-1.0.jar`.

-To run the application, you'll use `spark-submit`. Below is an example shell script (`submit.sh`) that includes the necessary Java compatability flags for Spark on Java 17:
+To run the application, you will use `spark-submit`. Below is an example shell script (`submit.sh`) that includes the necessary Java compatibility flags for Spark on Java 17:

@@ -142,23 +147,22 @@
chmod +x submit.sh

At this point, your project directory should be structured like this:

-```
-.
-├── Makefile
-├── README.md
-├── build.sbt
-├── project
-│   ├── assembly.sbt
-│   ├── build.properties
-│   └── project
-├── spark-submit.sh
-└── src
-    └── main
-        └── scala
-            └── com
-                └── example
-                    └── R2DataCatalogDemo.scala
-```
+<FileTree>
+- build.sbt
+- project
+  - assembly.sbt
+  - build.properties
+  - project
+- submit.sh
+- src
+  - main
+    - scala
+      - com
+        - example
+          - R2DataCatalogDemo.scala
+</FileTree>

-Before submitting the job, make sure you have the required environment variable set for your catalog URI, warehouse, and [Cloudflare API token](/r2/api/tokens/).
+Before submitting the job, make sure you have the required environment variables set for your catalog URI, warehouse, and [Cloudflare API token](/r2/api/tokens/).

```bash
export CATALOG_URI=
export WAREHOUSE=
export TOKEN=
```

-You're now ready to run the job:
+You are now ready to run the job:

```bash
./submit.sh
```

diff --git a/src/content/docs/r2/data-catalog/get-started.mdx b/src/content/docs/r2/data-catalog/get-started.mdx
index c4d276c50f7661b..3a56c312fd42013 100644
--- a/src/content/docs/r2/data-catalog/get-started.mdx
+++ b/src/content/docs/r2/data-catalog/get-started.mdx
@@ -1,6 +1,6 @@
---
pcx_content_type: get-started
-title: Get started
+title: Getting started
head: []
sidebar:
  order: 2
@@ -44,7 +44,7 @@ This guide will instruct you through:
    npx wrangler login
    ```

-2. Then, enable the catalog on your chosen R2 bucket:
+2. Enable the catalog on your chosen R2 bucket:

    ```
    npx wrangler r2 bucket r2-data-catalog-tutorial
    ```

@@ -104,20 +104,20 @@ Iceberg clients (including [PyIceberg](https://py.iceberg.apache.org/)) must authenticate to the catalog with a [Cloudflare API token](/fundamentals/api/get-started/create-token/) that has both R2 and catalog permissions.

6. Select **Create API Token**.

-7. Note the **Token value**, you will need this.
+7. Note the **Token value**.

## 4. Install uv

-Next, you'll need to install a Python package manager, in this guide we'll be using [uv](https://docs.astral.sh/uv/). If you don't already have uv installed, follow the [installing uv guide](https://docs.astral.sh/uv/getting-started/installation/).
+You need to install a Python package manager.
In this guide, use [uv](https://docs.astral.sh/uv/). If you do not already have uv installed, follow the [installing uv guide](https://docs.astral.sh/uv/getting-started/installation/). ## 5. Install marimo -We'll be using [marimo](https://github.com/marimo-team/marimo) as a Python notebook. +We will use [marimo](https://github.com/marimo-team/marimo) as a Python notebook. -1. Create a directory where our notebook will live: +1. Create a directory where our notebook will be stored: ``` mkdir r2-data-catalog-notebook @@ -269,7 +269,7 @@ We'll be using [marimo](https://github.com/marimo-team/marimo) as a Python noteb app.run() ``` -3. Replace the `CATALOG_URI`, `WAREHOUSE` and `TOKEN` variables with your values from sections **2** and **3** respectively. +3. Replace the `CATALOG_URI`, `WAREHOUSE`, and `TOKEN` variables with your values from sections **2** and **3** respectively. In the Python notebook above, you: @@ -286,7 +286,7 @@ In the Python notebook above, you: diff --git a/src/content/docs/r2/data-catalog/index.mdx b/src/content/docs/r2/data-catalog/index.mdx index 300d59214ba2c85..164c58be095e968 100644 --- a/src/content/docs/r2/data-catalog/index.mdx +++ b/src/content/docs/r2/data-catalog/index.mdx @@ -19,7 +19,7 @@ R2 Data Catalog is a managed [Apache Iceberg](https://iceberg.apache.org/) data R2 Data Catalog makes it easy to turn an R2 bucket into a data warehouse or lakehouse for a variety of analytical workloads including log analytics, business intelligence, and data pipelines. R2's zero-egress fee model means that data users and consumers can access and analyze data from different clouds, data platforms, or regions without incurring transfer costs. -Refer to the [get started guide](/r2/data-catalog/get-started/) to start with R2 Data Catalog. +To get started with R2 Data Catalog, refer to the [R2 Data Catalog: Getting started](/r2/data-catalog/get-started/). ## What is Apache Iceberg? @@ -49,7 +49,7 @@ Similarly, data catalogs ensure consistent, coordinated access, which allows mul diff --git a/src/content/docs/r2/data-catalog/managing-catalogs.mdx b/src/content/docs/r2/data-catalog/manage-catalogs.mdx similarity index 94% rename from src/content/docs/r2/data-catalog/managing-catalogs.mdx rename to src/content/docs/r2/data-catalog/manage-catalogs.mdx index dad8406450cfb60..9a97c6d9858c84d 100644 --- a/src/content/docs/r2/data-catalog/managing-catalogs.mdx +++ b/src/content/docs/r2/data-catalog/manage-catalogs.mdx @@ -1,6 +1,6 @@ --- pcx_content_type: configuration -title: Managing catalogs +title: Manage catalogs description: Understand how to manage Iceberg REST catalogs associated with R2 buckets sidebar: order: 3 @@ -18,7 +18,10 @@ import { LinkCard, } from "~/components"; -Learn how to enable and disable [R2 Data Catalog](/r2/data-catalog/) on your buckets and authenticate Iceberg engines using API tokens. +Learn how to: + +- Enable and disable [R2 Data Catalog](/r2/data-catalog/) on your buckets. +- Authenticate Iceberg engines using API tokens. ## Enable R2 Data Catalog on a bucket @@ -72,7 +75,7 @@ To connect your Iceberg engine to R2 Data Catalog, you will need a Cloudflare AP 2. Copy the **Token value** from your new API token. 3. In your engine configuration, provide this token as a bearer token. 
Internally, this token will be sent as: - + ``` Authorization: Bearer ``` diff --git a/src/content/docs/r2/pricing.mdx b/src/content/docs/r2/pricing.mdx index 01d72f995f797d4..832ae94e325682a 100644 --- a/src/content/docs/r2/pricing.mdx +++ b/src/content/docs/r2/pricing.mdx @@ -24,13 +24,13 @@ To learn about potential cost savings from using R2, refer to the [R2 pricing ca ## R2 pricing -| | Standard storage | Infrequent Access storage | -| ---------------------------------- | ------------------------ | ------------------------------------------------------ | -| Storage | $0.015 / GB-month | $0.01 / GB-month | -| Class A Operations | $4.50 / million requests | $9.00 / million requests | -| Class B Operations | $0.36 / million requests | $0.90 / million requests | -| Data Retrieval (processing) | None | $0.01 / GB | -| Egress (data transfer to Internet) | Free [^1] | Free [^1] | +| | Standard storage | Infrequent Access storage | +| ---------------------------------- | ------------------------ | ------------------------------------------------------- | +| Storage | $0.015 / GB-month | $0.01 / GB-month | +| Class A Operations | $4.50 / million requests | $9.00 / million requests | +| Class B Operations | $0.36 / million requests | $0.90 / million requests | +| Data Retrieval (processing) | None | $0.01 / GB | +| Egress (data transfer to Internet) | Free [^1] | Free [^1] | ### Free tier @@ -82,7 +82,7 @@ For objects stored in Infrequent Access storage, you will be charged for the obj ## R2 Data Catalog pricing -R2 Data Catalog is in **public beta**, and any developer with [R2 subscription](/r2/pricing/) can start using it. Currently, outside of standard R2 storage and operations, you will not be billed for your use of R2 Data Catalog. We'll provide at least 30 days notice before we make any changes or start charging for usage +R2 Data Catalog is in **public beta**, and any developer with an [R2 subscription](/r2/pricing/) can start using it. Currently, outside of standard R2 storage and operations, you will not be billed for your use of R2 Data Catalog. We will provide at least 30 days' notice before we make any changes or start charging for usage. To learn more about our thinking on future pricing, refer to the [R2 Data Catalog announcement blog](https://blog.cloudflare.com/r2-data-catalog-public-beta). From 9b8cc066bea93e226d9c264ed0820454e5270b1b Mon Sep 17 00:00:00 2001 From: Phillip Jones Date: Mon, 7 Apr 2025 20:29:24 -0700 Subject: [PATCH 5/9] Fix PR comments/typos. --- .../r2/data-catalog/config-examples/pyiceberg.mdx | 2 +- .../r2/data-catalog/config-examples/snowflake.mdx | 2 +- .../docs/r2/data-catalog/config-examples/spark.mdx | 2 +- src/content/docs/r2/data-catalog/get-started.mdx | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx index f79e4849009cc21..dea9566b4b4fe2f 100644 --- a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx +++ b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx @@ -8,7 +8,7 @@ Below is an example of using [PyIceberg](https://py.iceberg.apache.org/) to conn ## Prerequisites - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages). -- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog. 
From 9b8cc066bea93e226d9c264ed0820454e5270b1b Mon Sep 17 00:00:00 2001
From: Phillip Jones
Date: Mon, 7 Apr 2025 20:29:24 -0700
Subject: [PATCH 5/9] Fix PR comments/typos.

---
 .../r2/data-catalog/config-examples/pyiceberg.mdx  |  2 +-
 .../r2/data-catalog/config-examples/snowflake.mdx  |  2 +-
 .../docs/r2/data-catalog/config-examples/spark.mdx |  2 +-
 src/content/docs/r2/data-catalog/get-started.mdx   | 12 ++++++------
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx
index f79e4849009cc21..dea9566b4b4fe2f 100644
--- a/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx
+++ b/src/content/docs/r2/data-catalog/config-examples/pyiceberg.mdx
@@ -8,7 +8,7 @@ Below is an example of using [PyIceberg](https://py.iceberg.apache.org/) to conn
 ## Prerequisites

 - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages).
-- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog.
+- [Create an R2 bucket](/r2/buckets/create-buckets/) and [enable the data catalog](/r2/data-catalog/manage-catalogs/#enable-r2-data-catalog-on-a-bucket).
 - [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions).
 - Install the [PyIceberg](https://py.iceberg.apache.org/#installation) and [PyArrow](https://arrow.apache.org/docs/python/install.html) libraries.

diff --git a/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx b/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx
index 2aa71e16c16dc13..2e14079025fdbd6 100644
--- a/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx
+++ b/src/content/docs/r2/data-catalog/config-examples/snowflake.mdx
@@ -8,7 +8,7 @@ Below is an example of using [Snowflake](https://docs.snowflake.com/en/user-guid
 ## Prerequisites

 - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages).
-- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog.
+- [Create an R2 bucket](/r2/buckets/create-buckets/) and [enable the data catalog](/r2/data-catalog/manage-catalogs/#enable-r2-data-catalog-on-a-bucket).
 - [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions).
 - A [Snowflake](https://www.snowflake.com/) account with the necessary privileges to create external volumes and catalog integrations.

diff --git a/src/content/docs/r2/data-catalog/config-examples/spark.mdx b/src/content/docs/r2/data-catalog/config-examples/spark.mdx
index 7ae24babd14501d..2c6cf92a9008c75 100644
--- a/src/content/docs/r2/data-catalog/config-examples/spark.mdx
+++ b/src/content/docs/r2/data-catalog/config-examples/spark.mdx
@@ -11,7 +11,7 @@ Below is an example of how you can build an [Apache Spark](https://spark.apache.
 ## Prerequisites

 - Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages).
-- [Create an R2 bucket](/r2/buckets/create-buckets/) and enable the data catalog.
+- [Create an R2 bucket](/r2/buckets/create-buckets/) and [enable the data catalog](/r2/data-catalog/manage-catalogs/#enable-r2-data-catalog-on-a-bucket).
 - [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions).
 - Install Java 17, Spark 3.5.3, and SBT 1.10.11.
   - Note: The specific versions of these tools are critical for this example to work.

diff --git a/src/content/docs/r2/data-catalog/get-started.mdx b/src/content/docs/r2/data-catalog/get-started.mdx
index 3a56c312fd42013..deef04fcc9a1359 100644
--- a/src/content/docs/r2/data-catalog/get-started.mdx
+++ b/src/content/docs/r2/data-catalog/get-started.mdx
@@ -44,10 +44,10 @@ This guide will instruct you through:
   npx wrangler login
   ```

-2. Enable the catalog on your chosen R2 bucket:
+2. Create an R2 bucket:

   ```
-  npx wrangler r2 bucket r2-data-catalog-tutorial
+  npx wrangler r2 bucket create r2-data-catalog-tutorial
   ```

@@ -57,9 +57,9 @@ This guide will instruct you through:

 1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar.

-2. Select the bucket you want to enable as a data catalog.
-3. Switch to the **Settings** tab, scroll down to **R2 Data Catalog**, and select **Enable**.
-4. Once enabled, note the **Catalog URI** and **Warehouse name**.
+2. Select **Create bucket**.
+3. Enter the bucket name: `r2-data-catalog-tutorial`.
+4. Select **Create bucket**.

@@ -80,7 +80,7 @@ Then, enable the catalog on your chosen R2 bucket:

 1. From the Cloudflare dashboard, select **R2 Object Storage** from the sidebar.
-2. Select the bucket you want to enable as a data catalog.
+2. Select the bucket: `r2-data-catalog-tutorial`.
 3. Switch to the **Settings** tab, scroll down to **R2 Data Catalog**, and select **Enable**.
 4. Once enabled, note the **Catalog URI** and **Warehouse name**.

From 492e01c3674bed0049bf8fa96643b610a85673a0 Mon Sep 17 00:00:00 2001
From: Phillip Jones
Date: Mon, 7 Apr 2025 21:35:49 -0700
Subject: [PATCH 6/9] Added PySpark example configuration.

---
 .../r2/2025-04-10-r2-data-catalog-beta.mdx |  4 +-
 .../config-examples/spark-python.mdx       | 71 +++++++++++++++++++
 .../{spark.mdx => spark-scala.mdx}         |  2 +-
 src/content/docs/r2/data-catalog/index.mdx |  2 +-
 4 files changed, 75 insertions(+), 4 deletions(-)
 create mode 100644 src/content/docs/r2/data-catalog/config-examples/spark-python.mdx
 rename src/content/docs/r2/data-catalog/config-examples/{spark.mdx => spark-scala.mdx} (99%)

diff --git a/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx b/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx
index 165deb633da91a1..de53ab195849e0e 100644
--- a/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx
+++ b/src/content/changelog/r2/2025-04-10-r2-data-catalog-beta.mdx
@@ -9,9 +9,9 @@ hidden: true

 Today, we're launching [R2 Data Catalog](/r2/data-catalog/) in open beta, a managed Apache Iceberg catalog built directly into your [Cloudflare R2](/r2/) bucket.

-If you're not already familiar with it, [Apache Iceberg](https://iceberg.apache.org/) is an open table format designed to handle large-scale analytics datasets stored in object storage, offering ACID transactions and schema evolution. R2 Data Catalog exposes a standard Iceberg REST catalog interface, so you can connect engines like [Spark](/r2/data-catalog/config-examples/spark/), [Snowflake](/r2/data-catalog/config-examples/snowflake/), and [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/) to start querying your tables using the tools you already know.
+If you're not already familiar with it, [Apache Iceberg](https://iceberg.apache.org/) is an open table format designed to handle large-scale analytics datasets stored in object storage, offering ACID transactions and schema evolution. R2 Data Catalog exposes a standard Iceberg REST catalog interface, so you can connect engines like [Spark](/r2/data-catalog/config-examples/spark-scala/), [Snowflake](/r2/data-catalog/config-examples/snowflake/), and [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/) to start querying your tables using the tools you already know.

-To enable a data catalog on your R2 bucket, find **R2 Data Catalog** in your buckets settings in the dashboard or run:
+To enable a data catalog on your R2 bucket, find **R2 Data Catalog** in your bucket's settings in the dashboard, or run:

 ```bash
 npx wrangler r2 bucket catalog enable my-bucket
 ```

diff --git a/src/content/docs/r2/data-catalog/config-examples/spark-python.mdx b/src/content/docs/r2/data-catalog/config-examples/spark-python.mdx
new file mode 100644
index 000000000000000..9d8f84ee73a9931
--- /dev/null
+++ b/src/content/docs/r2/data-catalog/config-examples/spark-python.mdx
@@ -0,0 +1,71 @@
+---
+title: Spark (PySpark)
+pcx_content_type: example
+---
+
+Below is an example of using [PySpark](https://spark.apache.org/docs/latest/api/python/index.html) to connect to R2 Data Catalog.
+
+## Prerequisites
+
+- Sign up for a [Cloudflare account](https://dash.cloudflare.com/sign-up/workers-and-pages).
+- [Create an R2 bucket](/r2/buckets/create-buckets/) and [enable the data catalog](/r2/data-catalog/manage-catalogs/#enable-r2-data-catalog-on-a-bucket).
+- [Create an R2 API token](/r2/api/tokens/) with both [R2 and data catalog permissions](/r2/api/tokens/#permissions).
+- Install the [PySpark](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) library.
+
+## Example usage
+
+```py
+from pyspark.sql import SparkSession
+
+# Define catalog connection details (replace variables)
+WAREHOUSE = "<WAREHOUSE>"
+TOKEN = "<TOKEN>"
+CATALOG_URI = "<CATALOG_URI>"
+
+# Build Spark session with Iceberg configurations
+spark = SparkSession.builder \
+    .appName("R2DataCatalogExample") \
+    .config('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,org.apache.iceberg:iceberg-aws-bundle:1.6.1') \
+    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
+    .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog") \
+    .config("spark.sql.catalog.my_catalog.type", "rest") \
+    .config("spark.sql.catalog.my_catalog.uri", CATALOG_URI) \
+    .config("spark.sql.catalog.my_catalog.warehouse", WAREHOUSE) \
+    .config("spark.sql.catalog.my_catalog.token", TOKEN) \
+    .config("spark.sql.catalog.my_catalog.header.X-Iceberg-Access-Delegation", "vended-credentials") \
+    .config("spark.sql.catalog.my_catalog.s3.remote-signing-enabled", "false") \
+    .config("spark.sql.defaultCatalog", "my_catalog") \
+    .getOrCreate()
+spark.sql("USE my_catalog")
+
+# Create namespace if it does not exist
+spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
+
+# Create a table in the namespace using Iceberg
+spark.sql("""
+    CREATE TABLE IF NOT EXISTS default.my_table (
+        id BIGINT,
+        name STRING
+    )
+    USING iceberg
+""")
+
+# Create a simple DataFrame
+df = spark.createDataFrame(
+    [(1, "Alice"), (2, "Bob"), (3, "Charlie")],
+    ["id", "name"]
+)
+
+# Write the DataFrame to the Iceberg table
+df.write \
+    .format("iceberg") \
+    .mode("append") \
+    .save("default.my_table")
+
+# Read the data back from the Iceberg table
+result_df = spark.read \
+    .format("iceberg") \
+    .load("default.my_table")
+
+result_df.show()
+```

diff --git a/src/content/docs/r2/data-catalog/config-examples/spark.mdx b/src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx
similarity index 99%
rename from src/content/docs/r2/data-catalog/config-examples/spark.mdx
rename to src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx
index 2c6cf92a9008c75..245d62a4e47a5fe 100644
--- a/src/content/docs/r2/data-catalog/config-examples/spark.mdx
+++ b/src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx
@@ -1,5 +1,5 @@
 ---
-title: Spark
+title: Spark (Scala)
 pcx_content_type: example
 ---

diff --git a/src/content/docs/r2/data-catalog/index.mdx b/src/content/docs/r2/data-catalog/index.mdx
index 164c58be095e968..a40bb195ffcd810 100644
--- a/src/content/docs/r2/data-catalog/index.mdx
+++ b/src/content/docs/r2/data-catalog/index.mdx
@@ -15,7 +15,7 @@ import { Render, LinkCard } from "~/components";
 R2 Data Catalog is in **public beta**, and any developer with an [R2 subscription](/r2/pricing/) can start using it. Currently, outside of standard R2 storage and operations, you will not be billed for your use of R2 Data Catalog.
 :::

-R2 Data Catalog is a managed [Apache Iceberg](https://iceberg.apache.org/) data catalog built directly into your R2 bucket.
It exposes a standard Iceberg REST catalog interface, so you can connect the engines you already use, like [Spark](/r2/data-catalog/config-examples/spark/), [Snowflake](/r2/data-catalog/config-examples/snowflake/), and [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/). +R2 Data Catalog is a managed [Apache Iceberg](https://iceberg.apache.org/) data catalog built directly into your R2 bucket. It exposes a standard Iceberg REST catalog interface, so you can connect the engines you already use, like [Spark](/r2/data-catalog/config-examples/spark-scala/), [Snowflake](/r2/data-catalog/config-examples/snowflake/), and [PyIceberg](/r2/data-catalog/config-examples/pyiceberg/). R2 Data Catalog makes it easy to turn an R2 bucket into a data warehouse or lakehouse for a variety of analytical workloads including log analytics, business intelligence, and data pipelines. R2's zero-egress fee model means that data users and consumers can access and analyze data from different clouds, data platforms, or regions without incurring transfer costs. From e9c21fdaf2d4346fb544edd598620c08f3c318c4 Mon Sep 17 00:00:00 2001 From: Jun Lee Date: Tue, 8 Apr 2025 17:29:05 +0100 Subject: [PATCH 7/9] Update src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx --- .../docs/r2/data-catalog/config-examples/spark-scala.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx b/src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx index 245d62a4e47a5fe..434f8d032e33533 100644 --- a/src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx +++ b/src/content/docs/r2/data-catalog/config-examples/spark-scala.mdx @@ -68,7 +68,7 @@ object R2DataCatalogDemo { } ``` -For building this application and managing dependencies, we'll use [sbt (“simple build tool”)](https://www.scala-sbt.org/). The following is an example `build.sbt` file to place at the root of your project. It is configured to produce a "fat JAR", bundling all required dependencies. +For building this application and managing dependencies, we will use [sbt (“simple build tool”)](https://www.scala-sbt.org/). The following is an example `build.sbt` file to place at the root of your project. It is configured to produce a "fat JAR", bundling all required dependencies. ```java name := "R2DataCatalogDemo" From 765fad6582af56bf9bbebb17c3866fbff2bc643d Mon Sep 17 00:00:00 2001 From: Phillip Jones Date: Tue, 8 Apr 2025 16:00:19 -0700 Subject: [PATCH 8/9] Added more context for data catalog auth --- src/content/docs/r2/api/tokens.mdx | 52 ++++++++++++++----- .../docs/r2/data-catalog/manage-catalogs.mdx | 22 ++++---- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/src/content/docs/r2/api/tokens.mdx b/src/content/docs/r2/api/tokens.mdx index 1f50863aca0b5df..341906a9757766e 100644 --- a/src/content/docs/r2/api/tokens.mdx +++ b/src/content/docs/r2/api/tokens.mdx @@ -45,16 +45,16 @@ Jurisdictional buckets can only be accessed via the corresponding jurisdictional ## Permissions -| Permission | Description | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Admin Read & Write | Allows the ability to create, list, and delete buckets, edit bucket configuration, read, write, and list objects, and read and write to data catalog tables and associated metadata. 
| -| Admin Read only | Allows the ability to list buckets and view bucket configuration, read and list objects, and read from the data catalog tables and associated metadata. | -| Object Read & Write | Allows the ability to read, write, and list objects in specific buckets. | -| Object Read only | Allows the ability to read and list objects in specific buckets. | +| Permission | Description | +| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Admin Read & Write | Allows the ability to create, list, and delete buckets, edit bucket configuration, read, write, and list objects, and read and write to data catalog tables and associated metadata. | +| Admin Read only | Allows the ability to list buckets and view bucket configuration, read and list objects, and read from the data catalog tables and associated metadata. | +| Object Read & Write | Allows the ability to read, write, and list objects in specific buckets. | +| Object Read only | Allows the ability to read and list objects in specific buckets. | :::note -Currently **Admin Read & Write** or **Admin Read only** permission is required to interact with [R2 Data Catalog](/r2/data-catalog/). +Currently **Admin Read & Write** or **Admin Read only** permission is required to use [R2 Data Catalog](/r2/data-catalog/). ::: @@ -96,7 +96,7 @@ All buckets in an account are represented as: #### Permission groups -Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#permission-groups) should be applied. There are four relevant permission groups for R2. +Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#permission-groups) should be applied. @@ -107,7 +107,7 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm Resource @@ -128,7 +129,8 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm Account @@ -139,7 +141,7 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm Bucket @@ -150,7 +152,31 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm Bucket + + + + + + + + + + diff --git a/src/content/docs/r2/data-catalog/manage-catalogs.mdx b/src/content/docs/r2/data-catalog/manage-catalogs.mdx index 9a97c6d9858c84d..aa5a98cac4d8c4d 100644 --- a/src/content/docs/r2/data-catalog/manage-catalogs.mdx +++ b/src/content/docs/r2/data-catalog/manage-catalogs.mdx @@ -68,20 +68,20 @@ npx wrangler r2 bucket catalog disable ## Authenticate your Iceberg engine -To connect your Iceberg engine to R2 Data Catalog, you will need a Cloudflare API token with both [R2 and R2 Data Catalog permissions](/r2/api/tokens/#permissions). Iceberg engines that support the REST catalog let you supply this token to authenticate with R2 Data Catalog. +To connect your Iceberg engine to R2 Data Catalog, you must provide a Cloudflare API token with **both** R2 Data Catalog permissions and R2 storage permissions. Iceberg engines interact with R2 Data Catalog to perform table operations. The catalog also provides engines with SigV4 credentials, which are required to access the underlying data files stored in R2. - -1. Create an [R2 API token](/r2/api/tokens/#permissions) with **Admin Read & Write** or **Admin Read only** permission. -2. Copy the **Token value** from your new API token. -3. In your engine configuration, provide this token as a bearer token. 
- Internally, this token will be sent as: +### Create API token in the dashboard - ``` - Authorization: Bearer - ``` +Create an [R2 API token](/r2/api/tokens/#permissions) with **Admin Read & Write** or **Admin Read only** permissions. These permissions include both: - in HTTP requests to your bucket's data catalog. - +- Access to R2 Data Catalog (read-only or read/write, depending on chosen permission) +- Access to R2 storage (read-only or read/write, depending on chosen permission) + +Providing the resulting token value to your Iceberg engine gives it the ability to manage catalog metadata and handle data operations (reads or writes to R2). + +### Create API token via API + +To learn how to create API tokens for R2 Data Catalog using the API, including required permission groups and usage examples, refer to the [Create API tokens via API documentation](/r2/api/tokens/#create-api-tokens-via-api). ## Learn more From 7fa96e3bda5918a126aecf63f11b3e78d6c7b018 Mon Sep 17 00:00:00 2001 From: Phillip Jones Date: Wed, 9 Apr 2025 14:08:47 -0700 Subject: [PATCH 9/9] Add access policy example for r2 data catalog API tokens --- src/content/docs/r2/api/tokens.mdx | 4 +-- .../docs/r2/data-catalog/manage-catalogs.mdx | 29 ++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/content/docs/r2/api/tokens.mdx b/src/content/docs/r2/api/tokens.mdx index 341906a9757766e..f7bba2c8cb69793 100644 --- a/src/content/docs/r2/api/tokens.mdx +++ b/src/content/docs/r2/api/tokens.mdx @@ -160,7 +160,7 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm Workers R2 Data Catalog Write
- Permission + Description
@@ -117,7 +117,8 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm Account - Admin Read & Write + Can create, delete, and list buckets, edit bucket configuration, and + read, write, and list objects.
- Admin Read only + Can list buckets and view bucket configuration, and read and list + objects.
- Object Read & Write + Can read, write, and list objects in buckets.
- Object Read only + Can read and list objects in buckets. +
+ Workers R2 Data Catalog Write + + Bucket + + Can read from and write to data catalogs. This permission allows + access to the Iceberg REST catalog interface. +
+ Workers R2 Data Catalog Read + + Bucket + + Can read from data catalogs. This permission allows read-only + access to the Iceberg REST catalog interface.
-          Bucket
+          Account

         Can read from and write to data catalogs. This permission allows

@@ -172,7 +172,7 @@ Determine what [permission groups](/fundamentals/api/how-to/create-via-api/#perm
         Workers R2 Data Catalog Read

-          Bucket
+          Account

         Can read from data catalogs. This permission allows read-only

diff --git a/src/content/docs/r2/data-catalog/manage-catalogs.mdx b/src/content/docs/r2/data-catalog/manage-catalogs.mdx
index aa5a98cac4d8c4d..962a3f8a05217d8 100644
--- a/src/content/docs/r2/data-catalog/manage-catalogs.mdx
+++ b/src/content/docs/r2/data-catalog/manage-catalogs.mdx
@@ -81,7 +81,34 @@ Providing the resulting token value to your Iceberg engine gives it the ability

 ### Create API token via API

-To learn how to create API tokens for R2 Data Catalog using the API, including required permission groups and usage examples, refer to the [Create API tokens via API documentation](/r2/api/tokens/#create-api-tokens-via-api).
+To create an API token programmatically for use with R2 Data Catalog, you will need to specify both R2 Data Catalog and R2 storage permission groups in your [Access Policy](/r2/api/tokens/#access-policy).
+
+#### Example access policy
+
+```json
+[
+  {
+    "id": "f267e341f3dd4697bd3b9f71dd96247f",
+    "effect": "allow",
+    "resources": {
+      "com.cloudflare.edge.r2.bucket.4793d734c0b8e484dfc37ec392b5fa8a_default_my-bucket": "*",
+      "com.cloudflare.edge.r2.bucket.4793d734c0b8e484dfc37ec392b5fa8a_eu_my-eu-bucket": "*"
+    },
+    "permission_groups": [
+      {
+        "id": "d229766a2f7f4d299f20eaa8c9b1fde9",
+        "name": "Workers R2 Data Catalog Write"
+      },
+      {
+        "id": "2efd5506f9c8494dacb1fa10a3e7d5b6",
+        "name": "Workers R2 Storage Bucket Item Write"
+      }
+    ]
+  }
+]
+```
+
+To learn more about how to create API tokens for R2 Data Catalog using the API, including required permission groups and usage examples, refer to the [Create API tokens via API documentation](/r2/api/tokens/#create-api-tokens-via-api).

 ## Learn more
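As a companion to the access policy above, the following sketch creates such a token programmatically. It is an illustration only: it assumes the Cloudflare v4 `POST /user/tokens` endpoint and an existing token that is permitted to create API tokens, and every `<...>` value is a hypothetical placeholder. The permission group IDs and bucket resource are reused from the example policy.

```py
import json
import urllib.request

# <admin_api_token> is a placeholder for an existing token allowed to create API tokens
ADMIN_TOKEN = "<admin_api_token>"

# Policy mirroring the example access policy above
policy = {
    "effect": "allow",
    "resources": {
        "com.cloudflare.edge.r2.bucket.4793d734c0b8e484dfc37ec392b5fa8a_default_my-bucket": "*"
    },
    "permission_groups": [
        {"id": "d229766a2f7f4d299f20eaa8c9b1fde9", "name": "Workers R2 Data Catalog Write"},
        {"id": "2efd5506f9c8494dacb1fa10a3e7d5b6", "name": "Workers R2 Storage Bucket Item Write"},
    ],
}

req = urllib.request.Request(
    "https://api.cloudflare.com/client/v4/user/tokens",
    data=json.dumps({"name": "r2-data-catalog-token", "policies": [policy]}).encode(),
    headers={"Authorization": f"Bearer {ADMIN_TOKEN}", "Content-Type": "application/json"},
    method="POST",
)

# The response includes the new token's value; store it securely,
# since it is shown only once and cannot be retrieved again
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["result"]["value"])
```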