Commit 07442cc

[Bug Fix] Allow HiveCatalog to create table with TimestamptzType (#585)
1 parent 1016b19 commit 07442cc

7 files changed: +216 -31 lines changed

mkdocs/docs/configuration.md

Lines changed: 9 additions & 0 deletions

````diff
@@ -232,6 +232,15 @@ catalog:
     s3.secret-access-key: password
 ```
 
+When using Hive 2.x, make sure to set the compatibility flag:
+
+```yaml
+catalog:
+  default:
+    ...
+    hive.hive2-compatible: true
+```
+
 ## Glue Catalog
 
 Your AWS credentials can be passed directly through the Python API.
````
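The same flag can also be passed programmatically when constructing the catalog, which is how the tests in this commit exercise it. A minimal sketch (the metastore URI is a placeholder):

```python
from pyiceberg.catalog.hive import HiveCatalog

# Catalog properties are plain strings; "true" (case-insensitive) enables Hive2 mode.
catalog = HiveCatalog(
    "default",
    **{
        "uri": "thrift://localhost:9083",  # placeholder metastore URI
        "hive.hive2-compatible": "true",
    },
)
```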

pyiceberg/catalog/glue.py

Lines changed: 3 additions & 2 deletions

```diff
@@ -65,6 +65,7 @@
 from pyiceberg.table import (
     CommitTableRequest,
     CommitTableResponse,
+    PropertyUtil,
     Table,
     update_table_metadata,
 )
@@ -162,7 +163,7 @@ def primitive(self, primitive: PrimitiveType) -> str:
         if isinstance(primitive, DecimalType):
             return f"decimal({primitive.precision},{primitive.scale})"
         if (primitive_type := type(primitive)) not in GLUE_PRIMITIVE_TYPES:
-            return str(primitive_type.root)
+            return str(primitive)
         return GLUE_PRIMITIVE_TYPES[primitive_type]
 
 
@@ -344,7 +345,7 @@ def _update_glue_table(self, database_name: str, table_name: str, table_input: T
             self.glue.update_table(
                 DatabaseName=database_name,
                 TableInput=table_input,
-                SkipArchive=self.properties.get(GLUE_SKIP_ARCHIVE, GLUE_SKIP_ARCHIVE_DEFAULT),
+                SkipArchive=PropertyUtil.property_as_bool(self.properties, GLUE_SKIP_ARCHIVE, GLUE_SKIP_ARCHIVE_DEFAULT),
                 VersionId=version_id,
             )
         except self.glue.exceptions.EntityNotFoundException as e:
```
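Two fixes here: the fallback for primitive types without a native Glue mapping previously read `root` off the class (`type(primitive).root`), which does not yield the type's string form, whereas `str(primitive)` renders the instance itself; and `SkipArchive` is now parsed through the new `PropertyUtil.property_as_bool` helper, since catalog properties are strings and a raw `"false"` would otherwise reach boto3 as a truthy value. A small sketch of the fixed fallback, assuming pyiceberg primitives stringify to their Iceberg type names:

```python
from pyiceberg.types import TimestamptzType

# A type with no entry in GLUE_PRIMITIVE_TYPES now falls back to its
# Iceberg string representation rather than a broken class attribute.
print(str(TimestamptzType()))  # timestamptz
```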

pyiceberg/catalog/hive.py

Lines changed: 25 additions & 20 deletions

```diff
@@ -74,7 +74,7 @@
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
 from pyiceberg.schema import Schema, SchemaVisitor, visit
 from pyiceberg.serializers import FromInputFile
-from pyiceberg.table import CommitTableRequest, CommitTableResponse, Table, TableProperties, update_table_metadata
+from pyiceberg.table import CommitTableRequest, CommitTableResponse, PropertyUtil, Table, TableProperties, update_table_metadata
 from pyiceberg.table.metadata import new_table_metadata
 from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
 from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
@@ -95,6 +95,7 @@
     StringType,
     StructType,
     TimestampType,
+    TimestamptzType,
     TimeType,
     UUIDType,
 )
@@ -103,25 +104,13 @@
     import pyarrow as pa
 
 
-# Replace by visitor
-hive_types = {
-    BooleanType: "boolean",
-    IntegerType: "int",
-    LongType: "bigint",
-    FloatType: "float",
-    DoubleType: "double",
-    DateType: "date",
-    TimeType: "string",
-    TimestampType: "timestamp",
-    StringType: "string",
-    UUIDType: "string",
-    BinaryType: "binary",
-    FixedType: "binary",
-}
-
 COMMENT = "comment"
 OWNER = "owner"
 
+# If set to true, HiveCatalog will operate in Hive2 compatibility mode
+HIVE2_COMPATIBLE = "hive.hive2-compatible"
+HIVE2_COMPATIBLE_DEFAULT = False
+
 
 class _HiveClient:
     """Helper class to nicely open and close the transport."""
@@ -151,10 +140,15 @@ def __exit__(
         self._transport.close()
 
 
-def _construct_hive_storage_descriptor(schema: Schema, location: Optional[str]) -> StorageDescriptor:
+def _construct_hive_storage_descriptor(
+    schema: Schema, location: Optional[str], hive2_compatible: bool = False
+) -> StorageDescriptor:
     ser_de_info = SerDeInfo(serializationLib="org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
     return StorageDescriptor(
-        [FieldSchema(field.name, visit(field.field_type, SchemaToHiveConverter()), field.doc) for field in schema.fields],
+        [
+            FieldSchema(field.name, visit(field.field_type, SchemaToHiveConverter(hive2_compatible)), field.doc)
+            for field in schema.fields
+        ],
         location,
         "org.apache.hadoop.mapred.FileInputFormat",
         "org.apache.hadoop.mapred.FileOutputFormat",
@@ -199,6 +193,7 @@ def _annotate_namespace(database: HiveDatabase, properties: Properties) -> HiveD
     DateType: "date",
     TimeType: "string",
     TimestampType: "timestamp",
+    TimestamptzType: "timestamp with local time zone",
     StringType: "string",
     UUIDType: "string",
     BinaryType: "binary",
@@ -207,6 +202,11 @@ def _annotate_namespace(database: HiveDatabase, properties: Properties) -> HiveD
 
 
 class SchemaToHiveConverter(SchemaVisitor[str]):
+    hive2_compatible: bool
+
+    def __init__(self, hive2_compatible: bool):
+        self.hive2_compatible = hive2_compatible
+
     def schema(self, schema: Schema, struct_result: str) -> str:
         return struct_result
 
@@ -226,6 +226,9 @@ def map(self, map_type: MapType, key_result: str, value_result: str) -> str:
     def primitive(self, primitive: PrimitiveType) -> str:
         if isinstance(primitive, DecimalType):
             return f"decimal({primitive.precision},{primitive.scale})"
+        elif self.hive2_compatible and isinstance(primitive, TimestamptzType):
+            # Hive2 doesn't support timestamp with local time zone
+            return "timestamp"
         else:
             return HIVE_PRIMITIVE_TYPES[type(primitive)]
 
@@ -314,7 +317,9 @@ def create_table(
             owner=properties[OWNER] if properties and OWNER in properties else getpass.getuser(),
             createTime=current_time_millis // 1000,
             lastAccessTime=current_time_millis // 1000,
-            sd=_construct_hive_storage_descriptor(schema, location),
+            sd=_construct_hive_storage_descriptor(
+                schema, location, PropertyUtil.property_as_bool(self.properties, HIVE2_COMPATIBLE, HIVE2_COMPATIBLE_DEFAULT)
+            ),
             tableType=EXTERNAL_TABLE,
             parameters=_construct_parameters(metadata_location),
         )
```
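The net effect of the hive.py changes in one place: the converter now maps `TimestamptzType` to Hive's zone-aware type by default, and degrades to a plain `timestamp` when the catalog is flagged as Hive2-compatible. A quick sketch of the two modes, using the visitor directly:

```python
from pyiceberg.catalog.hive import SchemaToHiveConverter
from pyiceberg.schema import visit
from pyiceberg.types import TimestamptzType

# Hive 3 understands the zone-aware type...
assert visit(TimestamptzType(), SchemaToHiveConverter(hive2_compatible=False)) == "timestamp with local time zone"
# ...while Hive 2 has no such type, so compatibility mode falls back to plain timestamp.
assert visit(TimestamptzType(), SchemaToHiveConverter(hive2_compatible=True)) == "timestamp"
```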

pyiceberg/table/__init__.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -251,6 +251,12 @@ def property_as_int(properties: Dict[str, str], property_name: str, default: Opt
         else:
             return default
 
+    @staticmethod
+    def property_as_bool(properties: Dict[str, str], property_name: str, default: bool) -> bool:
+        if value := properties.get(property_name):
+            return value.lower() == "true"
+        return default
+
 
 class Transaction:
     _table: Table
```
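Semantics worth noting, since the Glue and Hive call sites both lean on this helper: only a case-insensitive `"true"` enables a flag, any other string parses as `False`, and a missing (or empty) value falls through to the default because of the walrus check. A quick sketch:

```python
from pyiceberg.table import PropertyUtil

props = {"hive.hive2-compatible": "TRUE"}

PropertyUtil.property_as_bool(props, "hive.hive2-compatible", False)  # True  ("TRUE".lower() == "true")
PropertyUtil.property_as_bool(props, "some-missing-key", True)        # True  (absent -> default)
PropertyUtil.property_as_bool({"flag": "yes"}, "flag", True)          # False (only "true" counts)
```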

tests/catalog/test_hive.py

Lines changed: 79 additions & 9 deletions

```diff
@@ -61,11 +61,24 @@
 from pyiceberg.transforms import BucketTransform, IdentityTransform
 from pyiceberg.typedef import UTF8
 from pyiceberg.types import (
+    BinaryType,
     BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FixedType,
+    FloatType,
     IntegerType,
+    ListType,
     LongType,
+    MapType,
     NestedField,
     StringType,
+    StructType,
+    TimestampType,
+    TimestamptzType,
+    TimeType,
+    UUIDType,
 )
 
 HIVE_CATALOG_NAME = "hive"
@@ -181,15 +194,20 @@ def test_check_number_of_namespaces(table_schema_simple: Schema) -> None:
     catalog.create_table("table", schema=table_schema_simple)
 
 
+@pytest.mark.parametrize("hive2_compatible", [True, False])
 @patch("time.time", MagicMock(return_value=12345))
-def test_create_table(table_schema_simple: Schema, hive_database: HiveDatabase, hive_table: HiveTable) -> None:
+def test_create_table(
+    table_schema_with_all_types: Schema, hive_database: HiveDatabase, hive_table: HiveTable, hive2_compatible: bool
+) -> None:
     catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL)
+    if hive2_compatible:
+        catalog = HiveCatalog(HIVE_CATALOG_NAME, uri=HIVE_METASTORE_FAKE_URL, **{"hive.hive2-compatible": "true"})
 
     catalog._client = MagicMock()
     catalog._client.__enter__().create_table.return_value = None
     catalog._client.__enter__().get_table.return_value = hive_table
     catalog._client.__enter__().get_database.return_value = hive_database
-    catalog.create_table(("default", "table"), schema=table_schema_simple, properties={"owner": "javaberg"})
+    catalog.create_table(("default", "table"), schema=table_schema_with_all_types, properties={"owner": "javaberg"})
 
     called_hive_table: HiveTable = catalog._client.__enter__().create_table.call_args[0][0]
     # This one is generated within the function itself, so we need to extract
@@ -207,9 +225,27 @@ def test_create_table(table_schema_simple: Schema, hive_database: HiveDatabase,
         retention=None,
         sd=StorageDescriptor(
             cols=[
-                FieldSchema(name="foo", type="string", comment=None),
-                FieldSchema(name="bar", type="int", comment=None),
-                FieldSchema(name="baz", type="boolean", comment=None),
+                FieldSchema(name='boolean', type='boolean', comment=None),
+                FieldSchema(name='integer', type='int', comment=None),
+                FieldSchema(name='long', type='bigint', comment=None),
+                FieldSchema(name='float', type='float', comment=None),
+                FieldSchema(name='double', type='double', comment=None),
+                FieldSchema(name='decimal', type='decimal(32,3)', comment=None),
+                FieldSchema(name='date', type='date', comment=None),
+                FieldSchema(name='time', type='string', comment=None),
+                FieldSchema(name='timestamp', type='timestamp', comment=None),
+                FieldSchema(
+                    name='timestamptz',
+                    type='timestamp' if hive2_compatible else 'timestamp with local time zone',
+                    comment=None,
+                ),
+                FieldSchema(name='string', type='string', comment=None),
+                FieldSchema(name='uuid', type='string', comment=None),
+                FieldSchema(name='fixed', type='binary', comment=None),
+                FieldSchema(name='binary', type='binary', comment=None),
+                FieldSchema(name='list', type='array<string>', comment=None),
+                FieldSchema(name='map', type='map<string,int>', comment=None),
+                FieldSchema(name='struct', type='struct<inner_string:string,inner_int:int>', comment=None),
             ],
             location=f"{hive_database.locationUri}/table",
             inputFormat="org.apache.hadoop.mapred.FileInputFormat",
@@ -266,12 +302,46 @@ def test_create_table(table_schema_simple: Schema, hive_database: HiveDatabase,
         location=metadata.location,
         table_uuid=metadata.table_uuid,
         last_updated_ms=metadata.last_updated_ms,
-        last_column_id=3,
+        last_column_id=22,
         schemas=[
             Schema(
-                NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
-                NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True),
-                NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
+                NestedField(field_id=1, name='boolean', field_type=BooleanType(), required=True),
+                NestedField(field_id=2, name='integer', field_type=IntegerType(), required=True),
+                NestedField(field_id=3, name='long', field_type=LongType(), required=True),
+                NestedField(field_id=4, name='float', field_type=FloatType(), required=True),
+                NestedField(field_id=5, name='double', field_type=DoubleType(), required=True),
+                NestedField(field_id=6, name='decimal', field_type=DecimalType(precision=32, scale=3), required=True),
+                NestedField(field_id=7, name='date', field_type=DateType(), required=True),
+                NestedField(field_id=8, name='time', field_type=TimeType(), required=True),
+                NestedField(field_id=9, name='timestamp', field_type=TimestampType(), required=True),
+                NestedField(field_id=10, name='timestamptz', field_type=TimestamptzType(), required=True),
+                NestedField(field_id=11, name='string', field_type=StringType(), required=True),
+                NestedField(field_id=12, name='uuid', field_type=UUIDType(), required=True),
+                NestedField(field_id=13, name='fixed', field_type=FixedType(length=12), required=True),
+                NestedField(field_id=14, name='binary', field_type=BinaryType(), required=True),
+                NestedField(
+                    field_id=15,
+                    name='list',
+                    field_type=ListType(type='list', element_id=18, element_type=StringType(), element_required=True),
+                    required=True,
+                ),
+                NestedField(
+                    field_id=16,
+                    name='map',
+                    field_type=MapType(
+                        type='map', key_id=19, key_type=StringType(), value_id=20, value_type=IntegerType(), value_required=True
+                    ),
+                    required=True,
+                ),
+                NestedField(
+                    field_id=17,
+                    name='struct',
+                    field_type=StructType(
+                        NestedField(field_id=21, name='inner_string', field_type=StringType(), required=False),
+                        NestedField(field_id=22, name='inner_int', field_type=IntegerType(), required=True),
+                    ),
+                    required=True,
+                ),
                 schema_id=0,
                 identifier_field_ids=[2],
             )
```
