Skip to content

Commit 6b36f90

Browse files
chore: Add documentation on how to use regexColumns (#74)
* chore: Add documentation on how to use regexColumns * typo * another typo * Include change for release notes * add pattern key
1 parent fe868c8 commit 6b36f90

File tree

4 files changed

+54
-13
lines changed

4 files changed

+54
-13
lines changed

CHANGES.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
## Next
44

5+
* PR #72: feat: Dynamic column qualifier support for reading from Bigtable
6+
57
## 0.4.0 - 2025-03-10
68

79
* PR #52: feat: Scala Version Upgrade (2.12 -> 2.13).

README-template.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,44 @@ columns and the `id` column is used as the row key. Note that you could also
163163
specify *compound* row keys,
164164
which are created by concatenating multiple DataFrame columns together.
165165

166+
#### Catalog with variable column definitions
167+
168+
You can also use `regexColumns` to match multiple columns in the same column
169+
family to a single DataFrame column. This can be useful in scenarios where
170+
you don't know the exact column qualifiers for your data ahead of time, like
171+
when your column qualifier is partially composed of other pieces of data.
172+
173+
For example, this catalog:
174+
```
175+
{
176+
"table": {"name": "t1"},
177+
"rowkey": "id_rowkey",
178+
"columns": {
179+
"id": {"cf": "rowkey", "col": "id_rowkey", "type": "string"}
180+
},
181+
"regexColumns": {
182+
"metadata": {"cf": "info", "pattern": "\\C*", "type": "long" }
183+
}
184+
}
185+
```
186+
187+
Would match all columns in the column family "info" and the result would be a
188+
DataFrame column named "metadata", where its contents would be a Map of String
189+
to Long, with the keys being the column qualifiers and the values being the results
190+
in those columns in Bigtable.
191+
192+
A few caveats:
193+
194+
- The values of all matching columns must be deserializable to the type defined
195+
in the catalog. If you expect to need more complex deserialization you can
196+
also define the type as `bytes` and run custom deserialization logic.
197+
- A catalog with regex columns cannot be used for writes.
198+
- Bigtable uses [RE2](https://github.com/google/re2/wiki/Syntax) for its regex
199+
implementation, which has slight differences from other implementations.
200+
- Because columns may contain arbitrary characters, including new lines, it is
201+
advisable to use `\C` as the wildcard expression, since `.` will not match on
202+
those.
203+
166204
### Writing to Bigtable
167205

168206
You can use the `bigtable` format along with specifying the Bigtable

spark-bigtable-core/src/test/scala/com/google/cloud/spark/bigtable/CatalogColumnMappingTest.scala

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ class CatalogColumnMappingTest
104104
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
105105
|},
106106
|"regexColumns":{
107-
|"someCol":{"cf":"cf1", "col":".*", "type":"string"}
107+
|"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
108108
|}
109109
|}""".stripMargin
110110

@@ -136,7 +136,7 @@ class CatalogColumnMappingTest
136136
|"staticCol":{"cf":"cf1", "col":"any", "type":"string"}
137137
|},
138138
|"regexColumns":{
139-
|"someCol":{"cf":"cf1", "col":".*", "type":"string"}
139+
|"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
140140
|}
141141
|}""".stripMargin
142142

@@ -168,7 +168,7 @@ class CatalogColumnMappingTest
168168
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
169169
|},
170170
|"regexColumns":{
171-
|"someCol":{"cf":"cf1", "col":".*", "type":"string"}
171+
|"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
172172
|}
173173
|}""".stripMargin
174174

@@ -207,8 +207,8 @@ class CatalogColumnMappingTest
207207
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
208208
|},
209209
|"regexColumns":{
210-
|"someCol":{"cf":"cf1", "col":".*", "type":"string"},
211-
|"anotherCol":{"cf":"cf1", "col":".*", "type":"string"}
210+
|"someCol":{"cf":"cf1", "pattern":".*", "type":"string"},
211+
|"anotherCol":{"cf":"cf1", "pattern":".*", "type":"string"}
212212
|}
213213
|}""".stripMargin
214214

@@ -240,7 +240,7 @@ class CatalogColumnMappingTest
240240
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
241241
|},
242242
|"regexColumns":{
243-
|"someCol":{"cf":"cf1", "col":"^a.*", "type":"string"}
243+
|"someCol":{"cf":"cf1", "pattern":"^a.*", "type":"string"}
244244
|}
245245
|}""".stripMargin
246246

@@ -279,7 +279,7 @@ class CatalogColumnMappingTest
279279
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
280280
|},
281281
|"regexColumns":{
282-
|"someCol":{"cf":"cf1", "col":".*", "type":"string"}
282+
|"someCol":{"cf":"cf1", "pattern":".*", "type":"string"}
283283
|}
284284
|}""".stripMargin
285285

@@ -318,7 +318,7 @@ class CatalogColumnMappingTest
318318
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
319319
|},
320320
|"regexColumns":{
321-
|"someCol":{"cf":"cf1", "col":"^a.*", "type":"string"}
321+
|"someCol":{"cf":"cf1", "pattern":"^a.*", "type":"string"}
322322
|}
323323
|}""".stripMargin
324324

@@ -351,7 +351,7 @@ class CatalogColumnMappingTest
351351
|"repeated-col":{"cf":"cf1", "col":"any", "type":"string"}
352352
|},
353353
|"regexColumns":{
354-
|"repeated-col":{"cf":"cf2", "col":".*", "type":"string"}
354+
|"repeated-col":{"cf":"cf2", "pattern":".*", "type":"string"}
355355
|}
356356
|}""".stripMargin
357357

@@ -377,7 +377,7 @@ class CatalogColumnMappingTest
377377
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
378378
|},
379379
|"regexColumns":{
380-
|"someCol":{"cf":"cf1", "col":"a\\va", "type":"string"}
380+
|"someCol":{"cf":"cf1", "pattern":"a\\va", "type":"string"}
381381
|}
382382
|}""".stripMargin
383383

@@ -420,7 +420,7 @@ class CatalogColumnMappingTest
420420
|"key":{"cf":"rowkey", "col":"row-key", "type":"string"}
421421
|},
422422
|"regexColumns":{
423-
|"someCol":{"cf":"cf1", "col":"\\X", "type":"string"}
423+
|"someCol":{"cf":"cf1", "pattern":"\\X", "type":"string"}
424424
|}
425425
|}""".stripMargin
426426

@@ -462,4 +462,4 @@ class CatalogColumnMappingTest
462462
.setRowKey(ByteString.copyFrom(BytesConverter.toBytes(row)))
463463
.build())
464464
}
465-
}
465+
}

third_party/hbase-spark-connector/hbase-connectors/src/main/scala/com/google/cloud/spark/bigtable/datasources/BigtableTableCatalog.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ object BigtableTableCatalog {
268268
val delimiter: Byte = 0
269269
val length = "length"
270270
val regexColumns = "regexColumns"
271+
val pattern = "pattern"
271272

272273
/** User provide table schema definition
273274
* {"tablename":"name", "rowkey":"key1:key2",
@@ -309,7 +310,7 @@ object BigtableTableCatalog {
309310
val f = Field(
310311
name,
311312
column(cf),
312-
column(col),
313+
column(pattern),
313314
column.get(`type`),
314315
sAvro,
315316
len

0 commit comments

Comments
 (0)