 
 import json
 from collections.abc import Callable, Sequence
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 from airflow.providers.amazon.aws.hooks.dynamodb import DynamoDBHook
 from airflow.providers.amazon.version_compat import BaseOperator
@@ -53,6 +53,7 @@ class HiveToDynamoDBOperator(BaseOperator):
     :param hiveserver2_conn_id: Reference to the
         :ref:`Hive Server2 thrift service connection id <howto/connection:hiveserver2>`.
     :param aws_conn_id: aws connection
+    :param df_type: DataFrame type to use ("pandas" or "polars").
     """
 
     template_fields: Sequence[str] = ("sql",)
@@ -73,6 +74,7 @@ def __init__(
         schema: str = "default",
         hiveserver2_conn_id: str = "hiveserver2_default",
         aws_conn_id: str | None = "aws_default",
+        df_type: Literal["pandas", "polars"] = "pandas",
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
@@ -86,14 +88,15 @@ def __init__(
         self.schema = schema
         self.hiveserver2_conn_id = hiveserver2_conn_id
         self.aws_conn_id = aws_conn_id
+        self.df_type = df_type
 
     def execute(self, context: Context):
         hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
 
         self.log.info("Extracting data from Hive")
         self.log.info(self.sql)
 
-        data = hive.get_df(self.sql, schema=self.schema, df_type="pandas")
+        data = hive.get_df(self.sql, schema=self.schema, df_type=self.df_type)
         dynamodb = DynamoDBHook(
             aws_conn_id=self.aws_conn_id,
             table_name=self.table_name,
@@ -104,7 +107,10 @@ def execute(self, context: Context):
         self.log.info("Inserting rows into dynamodb")
 
         if self.pre_process is None:
-            dynamodb.write_batch_data(json.loads(data.to_json(orient="records")))
+            if self.df_type == "polars":
+                dynamodb.write_batch_data(data.to_dicts())  # type:ignore[operator]
+            elif self.df_type == "pandas":
+                dynamodb.write_batch_data(json.loads(data.to_json(orient="records")))  # type:ignore[union-attr]
         else:
             dynamodb.write_batch_data(
                 self.pre_process(data=data, args=self.pre_process_args, kwargs=self.pre_process_kwargs)
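
Both branches hand `DynamoDBHook.write_batch_data` the same payload shape, a list of per-row dicts. A minimal sketch of that equivalence; the sample rows and column names below are illustrative and not taken from the provider's tests:

```python
import json

import pandas as pd
import polars as pl

# Illustrative rows; column names are placeholders.
rows = [{"feature_id": 1, "feature_name": "a"}, {"feature_id": 2, "feature_name": "b"}]

# pandas path: serialize to JSON records, then parse back into a list of dicts.
pandas_records = json.loads(pd.DataFrame(rows).to_json(orient="records"))

# polars path: to_dicts() returns a list of per-row dicts directly, no JSON round-trip.
polars_records = pl.DataFrame(rows).to_dicts()

assert pandas_records == polars_records  # same shape either way for write_batch_data
```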
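For reference, a minimal usage sketch of the new parameter; the DAG id, SQL, table name, and keys are placeholders, and the import path assumes the operator's current location in the amazon provider:

```python
from datetime import datetime

from airflow import DAG
from airflow.providers.amazon.aws.transfers.hive_to_dynamodb import HiveToDynamoDBOperator

with DAG(dag_id="example_hive_to_dynamodb", start_date=datetime(2025, 1, 1), schedule=None) as dag:
    hive_to_dynamodb = HiveToDynamoDBOperator(
        task_id="hive_to_dynamodb",
        sql="SELECT feature_id, feature_name FROM hive_features",  # placeholder query
        table_name="feature_table",
        table_keys=["feature_id"],
        df_type="polars",  # new; defaults to "pandas", preserving the previous behavior
    )
```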