Skip to content

Commit a65151f

Browse files
authored
Fix CopyIncrementally with no data (#54)
* Fix imports and ensure the IDE knows about types
* Use more descriptive messages in copy task
* Handle case when the source table is empty

The empty-source case occurs for partitioned tables, where the time-based query may not find any data for several hours.
1 parent 66e7dc1 commit a65151f

File tree

1 file changed

+18
-10
lines changed

1 file changed

+18
-10
lines changed

mara_pipelines/commands/sql.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import mara_db.dbs
1010
import mara_db.shell
11+
import mara_db.postgresql
1112
from mara_page import _, html
1213
from .. import config, shell, pipelines
1314
from ..incremental_processing import file_dependencies
@@ -43,6 +44,7 @@ def sql_file_path(self) -> pathlib.Path:
4344
pipeline_candidate = self
4445
while not isinstance(pipeline_candidate, pipelines.Pipeline):
4546
pipeline_candidate = pipeline_candidate.parent
47+
assert isinstance(pipeline_candidate, pipelines.Pipeline)
4648
return pipeline_candidate.base_path() / self.sql_file_name
4749

4850
def shell_command(self):
@@ -163,7 +165,7 @@ def target_db_alias(self):
163165
return self._target_db_alias or config.default_db_alias()
164166

165167
def file_path(self) -> pathlib.Path:
166-
return self.parent.parent.base_path() / self.file_name
168+
return self.parent.parent.base_path() / self.sql_file_name
167169

168170
def run(self) -> bool:
169171
if self.sql_file_name:
@@ -185,7 +187,6 @@ def run(self) -> bool:
185187
# (see also above in ExecuteSQL)
186188
file_dependencies.delete(self.node_path(), dependency_type)
187189

188-
189190
if not super().run():
190191
return False
191192

@@ -266,7 +267,7 @@ def run(self) -> bool:
266267
# retrieve the highest current value for the modification comparison (e.g.: the highest timestamp)
267268
# We intentionally use the command line here (rather than sqlalchemy) to avoid forcing people python drivers,
268269
# which can be hard for example in the case of SQL Server
269-
logger.log(f'get highest modification comparison value', format=logger.Format.ITALICS)
270+
logger.log(f'Get new max modification comparison value...', format=logger.Format.ITALICS)
270271
max_value_query = f'SELECT max({self.modification_comparison}) AS maxval FROM {self.source_table}'
271272
logger.log(max_value_query, format=logger.Format.VERBATIM)
272273
result = shell.run_shell_command(f'echo {shlex.quote(max_value_query)} \\\n | '
@@ -275,30 +276,37 @@ def run(self) -> bool:
275276
if not result:
276277
return False
277278

279+
if isinstance(result, bool):
280+
# This happens if the query above ran, but returned no data and therefore the load
281+
# query below would also return no data
282+
# We assume that this happens e.g. when there is no data *yet* and let the load succeed
283+
# without actually doing anything
284+
logger.log("Found no data, not starting Copy.", format=logger.Format.VERBATIM)
285+
return True
278286
# be flexible with different output formats: remove the column header & remove whitespace & quotes
279287
max_modification_value = ''.join(result).replace('maxval', '').strip().strip('"')
280-
logger.log(repr(max_modification_value), format=logger.Format.VERBATIM)
288+
logger.log(f"New max modification comparison value: {max_modification_value!r}", format=logger.Format.VERBATIM)
281289

282290
# check whether target table is empty
283291
target_table_is_empty = True
284292

285293
target_table_empty_query = f'SELECT TRUE FROM {self.target_table} LIMIT 1'
286-
logger.log(f'check if target table is empty', format=logger.Format.ITALICS)
294+
logger.log(f'Check if target table is empty', format=logger.Format.ITALICS)
287295
logger.log(target_table_empty_query, format=logger.Format.VERBATIM)
288296
with mara_db.postgresql.postgres_cursor_context(self.target_db_alias) as cursor:
289297
cursor.execute(f'SELECT TRUE FROM {self.target_table} LIMIT 1')
290298
target_table_is_empty = not cursor.fetchone()
291299
logger.log(f"target table{'' if target_table_is_empty else ' not'} empty", format=logger.Format.ITALICS)
292300

293301
# get last comparison value
294-
logger.log('get last comparison value', format=logger.Format.ITALICS)
302+
logger.log('Get last comparison value...', format=logger.Format.ITALICS)
295303
last_comparison_value = incremental_copy_status.get_last_comparison_value(
296304
self.node_path(), self.source_db_alias, self.source_table)
297-
logger.log(repr(last_comparison_value), format=logger.Format.VERBATIM)
305+
logger.log(f"Last max modification comparison value: {last_comparison_value!r}", format=logger.Format.VERBATIM)
298306

299307
if target_table_is_empty or not last_comparison_value:
300308
# full load
301-
logger.log('full (non incremental) copy', logger.Format.ITALICS)
309+
logger.log('Using full (non incremental) Copy', logger.Format.ITALICS)
302310
if not target_table_is_empty:
303311
truncate_query = f'TRUNCATE TABLE {self.target_table}'
304312
logger.log(truncate_query, format=logger.Format.VERBATIM)
@@ -309,7 +317,7 @@ def run(self) -> bool:
309317
# If we would crash during load (with some data already in the table), the next run would
310318
# not trigger a full load and we would miss data. To prevent that, delete the old
311319
# comparison value (we will then set it only on success)
312-
logger.log('deleting old comparison value', logger.Format.ITALICS)
320+
logger.log('Deleting old comparison value', logger.Format.ITALICS)
313321
incremental_copy_status.delete(self.node_path(), self.source_db_alias, self.source_table)
314322

315323
# overwrite the comparison criteria to get everything
@@ -320,7 +328,7 @@ def run(self) -> bool:
320328

321329
else:
322330
# incremental load. First create the table which will contain the delta
323-
logger.log('incremental copy, create upsert table', logger.Format.ITALICS)
331+
logger.log('Using incremental Copy, create upsert table', logger.Format.ITALICS)
324332
create_upsert_table_query = (f'DROP TABLE IF EXISTS {self.target_table}_upsert;\n'
325333
+ f'CREATE TABLE {self.target_table}_upsert AS SELECT * from {self.target_table} WHERE FALSE')
326334

0 commit comments

Comments
 (0)