@@ -19,7 +19,7 @@ In Java and Scala applications, you can use different dependency management
 tools (e.g., Maven, sbt, or Gradle) to access the
 connector `com.google.cloud.spark.bigtable:spark-bigtable_2.13:<version>` or
 `com.google.cloud.spark.bigtable:spark-bigtable_2.12:<version>` (current
-`<version>` is `0.4.0`) and package it inside your application JAR
+`<version>` is `0.5.0`) and package it inside your application JAR
 using libraries such as Maven Shade Plugin. For PySpark applications, you can
 use the `--jars` flag to pass the GCS address of the connector when submitting
 it.
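
For example, a PySpark job might be submitted along these lines (a minimal
sketch; the GCS path of the connector JAR is a placeholder to replace with the
address of the actual released artifact):

```
spark-submit \
  --jars gs://<path-to-connector>/spark-bigtable_2.12-0.5.0.jar \
  your_pyspark_job.py
```
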
@@ -31,7 +31,7 @@ For Maven, you can add the following snippet to your `pom.xml` file:
 <dependency>
   <groupId>com.google.cloud.spark.bigtable</groupId>
   <artifactId>spark-bigtable_2.13</artifactId>
-  <version>0.4.0</version>
+  <version>0.5.0</version>
 </dependency>
 ```
 
@@ -40,20 +40,20 @@ For Maven, you can add the following snippet to your `pom.xml` file:
 <dependency>
   <groupId>com.google.cloud.spark.bigtable</groupId>
   <artifactId>spark-bigtable_2.12</artifactId>
-  <version>0.4.0</version>
+  <version>0.5.0</version>
 </dependency>
 ```
 
 For sbt, you can add the following to your `build.sbt` file:
 
 ```
 // for Scala 2.13
-libraryDependencies += "com.google.cloud.spark.bigtable" % "spark-bigtable_2.13" % "0.4.0"
+libraryDependencies += "com.google.cloud.spark.bigtable" % "spark-bigtable_2.13" % "0.5.0"
 ```
 
 ```
 // for Scala 2.12
-libraryDependencies += "com.google.cloud.spark.bigtable" % "spark-bigtable_2.12" % "0.4.0"
+libraryDependencies += "com.google.cloud.spark.bigtable" % "spark-bigtable_2.12" % "0.5.0"
 ```
 
 Finally, you can add the following to your `build.gradle` file when using
@@ -62,14 +62,14 @@ Gradle:
 ```
 // for Scala 2.13
 dependencies {
-    implementation group: 'com.google.cloud.spark.bigtable', name: 'spark-bigtable_2.13', version: '0.4.0'
+    implementation group: 'com.google.cloud.spark.bigtable', name: 'spark-bigtable_2.13', version: '0.5.0'
 }
 ```
 
 ```
 // for Scala 2.12
 dependencies {
-    implementation group: 'com.google.cloud.spark.bigtable', name: 'spark-bigtable_2.12', version: '0.4.0'
+    implementation group: 'com.google.cloud.spark.bigtable', name: 'spark-bigtable_2.12', version: '0.5.0'
 }
 ```
 
@@ -157,6 +157,44 @@ columns and the `id` column is used as the row key. Note that you could also
 specify *compound* row keys,
 which are created by concatenating multiple DataFrame columns together.
 
+#### Catalog with variable column definitions
+
+You can also use `regexColumns` to match multiple columns in the same column
+family to a single DataFrame column. This can be useful when you don't know
+the exact column qualifiers for your data ahead of time, for example when a
+column qualifier is partially composed of other pieces of data.
+
+For example, this catalog:
+```
+{
+  "table": {"name": "t1"},
+  "rowkey": "id_rowkey",
+  "columns": {
+    "id": {"cf": "rowkey", "col": "id_rowkey", "type": "string"}
+  },
+  "regexColumns": {
+    "metadata": {"cf": "info", "pattern": "\\C*", "type": "long"}
+  }
+}
+```
+
+would match all columns in the column family `info`. The result is a DataFrame
+column named `metadata` whose contents are a Map of String to Long, with the
+keys being the column qualifiers and the values being the cell values read
+from those columns in Bigtable.
+
+A few caveats:
+
+- The values of all matching columns must be deserializable to the type
+  defined in the catalog. If you expect to need more complex deserialization,
+  you can define the type as `bytes` and run custom deserialization logic.
+- A catalog with regex columns cannot be used for writes.
+- Bigtable uses [RE2](https://github.com/google/re2/wiki/Syntax) for its regex
+  implementation, which has slight differences from other implementations.
+- Because column qualifiers may contain arbitrary characters, including
+  newlines, it is advisable to use `\C` as the wildcard expression, since `.`
+  does not match newlines.
+
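+For illustration, reading with such a catalog from Scala might look like the
+following minimal sketch. It assumes the `bigtable` format together with the
+`spark.bigtable.project.id` and `spark.bigtable.instance.id` options, and the
+project and instance IDs are placeholders:
+
+```
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.functions.col
+
+val spark = SparkSession.builder().appName("regex-catalog-read").getOrCreate()
+
+// The catalog from above: qualifiers in the "info" family matching the
+// pattern are collected into a single Map column named "metadata".
+val catalog =
+  """{
+    |  "table": {"name": "t1"},
+    |  "rowkey": "id_rowkey",
+    |  "columns": {
+    |    "id": {"cf": "rowkey", "col": "id_rowkey", "type": "string"}
+    |  },
+    |  "regexColumns": {
+    |    "metadata": {"cf": "info", "pattern": "\\C*", "type": "long"}
+    |  }
+    |}""".stripMargin
+
+// Placeholder project and instance IDs -- substitute your own values.
+val df = spark.read
+  .format("bigtable")
+  .option("catalog", catalog)
+  .option("spark.bigtable.project.id", "<your-project-id>")
+  .option("spark.bigtable.instance.id", "<your-instance-id>")
+  .load()
+
+// "metadata" arrives as a Map of String to Long; look up one qualifier.
+df.select(col("id"), col("metadata").getItem("some_qualifier")).show()
+```
+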
 ### Writing to Bigtable
 
 You can use the `bigtable` format along with specifying the Bigtable