99from typing import TYPE_CHECKING , cast , final
1010
1111import pandas as pd
12- import pyarrow as pa
1312import sqlalchemy
1413import ulid
1514from overrides import overrides
4241 from collections .abc import Generator , Iterator
4342 from pathlib import Path
4443
44+ import pyarrow as pa
4545 from sqlalchemy .engine import Connection , Engine
4646 from sqlalchemy .engine .cursor import CursorResult
4747 from sqlalchemy .engine .reflection import Inspector
@@ -545,17 +545,6 @@ def _finalize_batches(
545545 although this is a fairly rare edge case we can ignore in V1.
546546 """
547547 with self ._finalizing_batches (stream_name ) as batches_to_finalize :
548- if not batches_to_finalize :
549- return {}
550-
551- files : list [Path ] = []
552- # Get a list of all files to finalize from all pending batches.
553- for batch_handle in batches_to_finalize .values ():
554- batch_handle = cast (FileWriterBatchHandle , batch_handle )
555- files += batch_handle .files
556- # Use the max batch ID as the batch ID for table names.
557- max_batch_id = max (batches_to_finalize .keys ())
558-
559548 # Make sure the target schema and target table exist.
560549 self ._ensure_schema_exists ()
561550 final_table_name = self ._ensure_final_table_exists (
@@ -567,6 +556,18 @@ def _finalize_batches(
567556 raise_on_error = True ,
568557 )
569558
559+ if not batches_to_finalize :
560+ # If there are no batches to finalize, return after ensuring the table exists.
561+ return {}
562+
563+ files : list [Path ] = []
564+ # Get a list of all files to finalize from all pending batches.
565+ for batch_handle in batches_to_finalize .values ():
566+ batch_handle = cast (FileWriterBatchHandle , batch_handle )
567+ files += batch_handle .files
568+ # Use the max batch ID as the batch ID for table names.
569+ max_batch_id = max (batches_to_finalize .keys ())
570+
570571 temp_table_name = self ._write_files_to_new_table (
571572 files = files ,
572573 stream_name = stream_name ,
@@ -659,27 +660,25 @@ def _write_files_to_new_table(
659660 """
660661 temp_table_name = self ._create_table_for_loading (stream_name , batch_id )
661662 for file_path in files :
662- with pa .parquet .ParquetFile (file_path ) as pf :
663- record_batch = pf .read ()
664- dataframe = record_batch .to_pandas ()
665-
666- # Pandas will auto-create the table if it doesn't exist, which we don't want.
667- if not self ._table_exists (temp_table_name ):
668- raise exc .AirbyteLibInternalError (
669- message = "Table does not exist after creation." ,
670- context = {
671- "temp_table_name" : temp_table_name ,
672- },
673- )
674-
675- dataframe .to_sql (
676- temp_table_name ,
677- self .get_sql_alchemy_url (),
678- schema = self .config .schema_name ,
679- if_exists = "append" ,
680- index = False ,
681- dtype = self ._get_sql_column_definitions (stream_name ),
663+ dataframe = pd .read_json (file_path , lines = True )
664+
665+ # Pandas will auto-create the table if it doesn't exist, which we don't want.
666+ if not self ._table_exists (temp_table_name ):
667+ raise exc .AirbyteLibInternalError (
668+ message = "Table does not exist after creation." ,
669+ context = {
670+ "temp_table_name" : temp_table_name ,
671+ },
682672 )
673+
674+ dataframe .to_sql (
675+ temp_table_name ,
676+ self .get_sql_alchemy_url (),
677+ schema = self .config .schema_name ,
678+ if_exists = "append" ,
679+ index = False ,
680+ dtype = self ._get_sql_column_definitions (stream_name ),
681+ )
683682 return temp_table_name
684683
685684 @final
@@ -959,6 +958,11 @@ def register_source(
959958 This method is called by the source when it is initialized.
960959 """
961960 self ._source_name = source_name
961+ self .file_writer .register_source (
962+ source_name ,
963+ incoming_source_catalog ,
964+ stream_names = stream_names ,
965+ )
962966 self ._ensure_schema_exists ()
963967 super ().register_source (
964968 source_name ,
0 commit comments