chore: Add documentation on how to use regexColumns (#74)

andre-sampaio · web-flow · commit 6b36f9015093 · 2025-05-13T16:18:32.000-04:00
* chore: Add documentation on how to use regexColumns

* typo

* another typo

* Include change for release notes

* add pattern key
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,8 @@
 
 ## Next
 
+* PR #72: feat: Dynamic column qualifier support for reading from Bigtable
+
 ## 0.4.0 - 2025-03-10
 
 * PR #52: feat: Scala Version Upgrade (2.12 -> 2.13).
diff --git a/README-template.md b/README-template.md
@@ -163,6 +163,44 @@ columns and the `id` column is used as the row key. Note that you could also
 specify *compound* row keys,
 which are created by concatenating multiple DataFrame columns together.
 
+#### Catalog with variable column definitions
+
+You can also use `regexColumns` to match multiple columns in the same column
+family to a single data frame column. This can be useful in scenarios where
+you don't know the exact column qualifiers for your data ahead of time, like
+when your column qualifier is partially composed of other pieces of data.
+
+For example, this catalog:
+```
+{
+  "table": {"name": "t1"},
+  "rowkey": "id_rowkey",
+  "columns": {
+    "id": {"cf": "rowkey", "col": "id_rowkey", "type": "string"}
+  },
+  "regexColumns": {
+    "metadata": {"cf": "info", "pattern": "\\C*", "type": "long" }
+  }
+}
+```
+
+Would match all columns in the column family "info" and the result would be a
+DataFrame column named "metadata", whose contents would be a Map of String to
+Long, with the keys being the column qualifiers and the values being the
+contents of those columns in Bigtable.
+
+A few caveats:
+
+ - The values of all matching columns must be deserializable to the type defined
+   in the catalog. If you expect to need more complex deserialization you can
+   also define the type as `bytes` and run custom deserialization logic.
+ - A catalog with regex columns cannot be used for writes.
+ - Bigtable uses [RE2](https://github.com/google/re2/wiki/Syntax) for its regex
+   implementation, which has slight differences from other implementations.
+ - Because columns may contain arbitrary characters, including new lines, it is
+   advisable to use `\C` as the wildcard expression, since `.` will not match
+   new lines.
+
 ### Writing to Bigtable
 
 You can use the `bigtable` format along with specifying the Bigtable
diff --git a/spark-bigtable-core/src/test/scala/com/google/cloud/spark/bigtable/CatalogColumnMappingTest.scala b/spark-bigtable-core/src/test/scala/com/google/cloud/spark/bigtable/CatalogColumnMappingTest.scala
@@ -104,7 +104,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":".*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -136,7 +136,7 @@ class CatalogColumnMappingTest
          |"staticCol":{"cf":"cf1", "col":"any", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":".*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -168,7 +168,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":".*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -207,8 +207,8 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":".*", "type":"string"},
-         |"anotherCol":{"cf":"cf1", "col":".*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":".*", "type":"string"},
+         |"anotherCol":{"cf":"cf1", "pattern":".*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -240,7 +240,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":"^a.*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":"^a.*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -279,7 +279,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":".*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -318,7 +318,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":"^a.*", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":"^a.*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -351,7 +351,7 @@ class CatalogColumnMappingTest
          |"repeated-col":{"cf":"cf1", "col":"any", "type":"string"}
          |},
          |"regexColumns":{
-         |"repeated-col":{"cf":"cf2", "col":".*", "type":"string"}
+         |"repeated-col":{"cf":"cf2", "pattern":".*", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -377,7 +377,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":"a\\va", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":"a\\va", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -420,7 +420,7 @@ class CatalogColumnMappingTest
          |"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
          |},
          |"regexColumns":{
-         |"someCol":{"cf":"cf1", "col":"\\X", "type":"string"}
+         |"someCol":{"cf":"cf1", "pattern":"\\X", "type":"string"}
          |}
          |}""".stripMargin
 
@@ -462,4 +462,4 @@ class CatalogColumnMappingTest
         .setRowKey(ByteString.copyFrom(BytesConverter.toBytes(row)))
         .build())
   }
-}
+}
diff --git a/third_party/hbase-spark-connector/hbase-connectors/src/main/scala/com/google/cloud/spark/bigtable/datasources/BigtableTableCatalog.scala b/third_party/hbase-spark-connector/hbase-connectors/src/main/scala/com/google/cloud/spark/bigtable/datasources/BigtableTableCatalog.scala
@@ -268,6 +268,7 @@ object BigtableTableCatalog {
   val delimiter: Byte = 0
   val length = "length"
   val regexColumns = "regexColumns"
+  val pattern = "pattern"
 
   /** User provide table schema definition
     * {"tablename":"name", "rowkey":"key1:key2",
@@ -309,7 +310,7 @@ object BigtableTableCatalog {
       val f = Field(
         name,
         column(cf),
-        column(col),
+        column(pattern),
         column.get(`type`),
         sAvro,
         len