Skip to content

Commit b271785

Browse files
authored
Retention-based Partition Dropping (#44)
* Add command: drop, to calculate partition drops based on retention periods * Deduplicate methods that moved into database_helpers * Add database helper tests * Add dropper tests * More test cleanups * Update to PyLint 2.17.7 to fix Python 3.11 * More tests * pylint needs pytest * Add an assertion for correct ordering of partitions
1 parent fd793fa commit b271785

File tree

11 files changed

+677
-58
lines changed

11 files changed

+677
-58
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@ jobs:
1717
- name: Install Linting Tools
1818
run: |
1919
python -m pip install --upgrade pip
20-
pip install --user pylint==2.6.0
20+
pip install --user pylint==2.17.7
2121
pip install --user black~=22.3
2222
pip install --user flake8~=4.0
23+
pip install --user pytest
2324
2425
- name: Install Partition Manager
2526
run: |

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ repos:
1717
hooks:
1818
- id: flake8
1919
- repo: https://github.com/PyCQA/pylint
20-
rev: pylint-2.6.0
20+
rev: v2.17.7
2121
hooks:
2222
- id: pylint
2323
args:

partitionmanager/cli.py

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import traceback
1111
import yaml
1212

13+
import partitionmanager.database_helpers
14+
import partitionmanager.dropper
1315
import partitionmanager.migrate
1416
import partitionmanager.sql
1517
import partitionmanager.stats
@@ -121,10 +123,10 @@ def from_yaml_file(self, file):
121123
for key in data["tables"]:
122124
tab = partitionmanager.types.Table(key)
123125
tabledata = data["tables"][key]
124-
if isinstance(tabledata, dict) and "retention" in tabledata:
125-
tab.set_retention(
126+
if isinstance(tabledata, dict) and "retention_period" in tabledata:
127+
tab.set_retention_period(
126128
partitionmanager.types.timedelta_from_dict(
127-
tabledata["retention"]
129+
tabledata["retention_period"]
128130
)
129131
)
130132
if isinstance(tabledata, dict) and "partition_period" in tabledata:
@@ -318,16 +320,10 @@ def do_partition(conf):
318320
duration = table.partition_period
319321

320322
log.info(f"Evaluating {table} (duration={duration})")
321-
322-
positions = pm_tap.get_current_positions(
323-
conf.dbcmd, table, map_data["range_cols"]
323+
cur_pos = partitionmanager.database_helpers.get_position_of_table(
324+
conf.dbcmd, table, map_data
324325
)
325326

326-
log.info(f"{table} (pos={positions})")
327-
328-
cur_pos = partitionmanager.types.Position()
329-
cur_pos.set_position([positions[col] for col in map_data["range_cols"]])
330-
331327
sql_cmds = pm_tap.get_pending_sql_reorganize_partition_commands(
332328
database=conf.dbcmd,
333329
table=table,
@@ -465,6 +461,57 @@ def do_stats(conf, metrics=partitionmanager.stats.PrometheusMetrics()):
465461
return all_results
466462

467463

464+
def drop_cmd(args):
    """Calculate which partitions can be dropped.

    Helper for argparse: builds the configuration from the parsed
    command-line arguments and hands it to do_find_drops_for_tables,
    returning whatever that returns.
    """
    return do_find_drops_for_tables(config_from_args(args))
470+
471+
472+
DROP_PARSER = SUBPARSERS.add_parser("drop", help="drop old partitions")
473+
DROP_PARSER.set_defaults(func=drop_cmd)
474+
475+
476+
def do_find_drops_for_tables(conf):
    """Find the droppable partitions of every configured table.

    Tables without a date query or without a retention period are skipped
    with a warning; tables reporting compatibility problems are skipped with
    a debug message.

    Args:
        conf: configuration object providing ``tables``, ``dbcmd`` and
            ``curtime``.

    Returns:
        A dict mapping each processed table's name to the value returned by
        ``partitionmanager.dropper.get_droppable_partitions`` for it.

    Raises:
        Exception: any error raised while processing a table is logged as a
            warning and then re-raised unchanged.
    """
    all_results = {}
    for table in conf.tables:
        log = logging.getLogger(f"do_find_drops_for_tables:{table.name}")

        if not table.has_date_query:
            log.warning(f"Cannot process {table}: no date query specified")
            continue

        if not table.retention_period:
            log.warning(f"Cannot process {table}: no retention specified")
            continue

        try:
            table_problems = pm_tap.get_table_compatibility_problems(conf.dbcmd, table)
            if table_problems:
                log.debug(f"Cannot process {table}: {table_problems}")
                continue

            map_data = pm_tap.get_partition_map(conf.dbcmd, table)
            current_position = partitionmanager.database_helpers.get_position_of_table(
                conf.dbcmd, table, map_data
            )

            droppable = partitionmanager.dropper.get_droppable_partitions(
                conf.dbcmd,
                map_data["partitions"],
                current_position,
                conf.curtime,
                table,
            )

            all_results[table.name] = droppable
        except Exception:
            log.warning(f"Error processing table {table.name}")
            # Bare raise re-raises the active exception with its original
            # traceback, without adding this frame a second time.
            raise
    return all_results
513+
514+
468515
def main():
469516
"""Start here."""
470517
args = PARSER.parse_args()

partitionmanager/cli_test.py

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
migrate_cmd,
99
config_from_args,
1010
do_partition,
11+
drop_cmd,
1112
PARSER,
1213
partition_cmd,
1314
stats_cmd,
@@ -224,11 +225,9 @@ def test_partition_period_seven_days(self):
224225
[
225226
"INFO:partition:Evaluating Table partitioned_last_week "
226227
"(duration=7 days, 0:00:00)",
227-
"INFO:partition:Table partitioned_last_week (pos={'id': 150})",
228228
"DEBUG:partition:Table partitioned_last_week has no pending SQL updates.",
229229
"INFO:partition:Evaluating Table partitioned_yesterday "
230230
"(duration=7 days, 0:00:00)",
231-
"INFO:partition:Table partitioned_yesterday (pos={'id': 150})",
232231
"DEBUG:partition:Table partitioned_yesterday has no pending SQL updates.",
233232
]
234233
),
@@ -626,3 +625,60 @@ def test_migrate_cmd_in_out(self):
626625
"flip",
627626
]
628627
)
628+
629+
630+
class TestDropCmd(unittest.TestCase):
631+
def _run_drop_cmd_yaml(self, yaml):
632+
with tempfile.NamedTemporaryFile() as tmpfile:
633+
insert_into_file(tmpfile, yaml)
634+
args = PARSER.parse_args(["--config", tmpfile.name, "drop"])
635+
return drop_cmd(args)
636+
637+
def test_drop_invalid_config(self):
638+
with self.assertLogs(
639+
"do_find_drops_for_tables:unused", level="WARNING"
640+
) as logctx:
641+
self._run_drop_cmd_yaml(
642+
f"""
643+
partitionmanager:
644+
mariadb: {str(fake_exec)}
645+
tables:
646+
unused:
647+
earliest_utc_timestamp_query: >
648+
SELECT UNIX_TIMESTAMP(`issued`) FROM `unused`
649+
WHERE `id` > '?' ORDER BY `id` ASC LIMIT 1;
650+
"""
651+
)
652+
self.assertEqual(
653+
set(logctx.output),
654+
set(
655+
[
656+
"WARNING:do_find_drops_for_tables:unused:"
657+
"Cannot process Table unused: no retention specified"
658+
]
659+
),
660+
)
661+
662+
def test_drop_no_sql(self):
663+
with self.assertLogs(
664+
"do_find_drops_for_tables:unused", level="WARNING"
665+
) as logctx:
666+
self._run_drop_cmd_yaml(
667+
f"""
668+
partitionmanager:
669+
mariadb: {str(fake_exec)}
670+
tables:
671+
unused:
672+
retention_period:
673+
days: 180
674+
"""
675+
)
676+
self.assertEqual(
677+
set(logctx.output),
678+
set(
679+
[
680+
"WARNING:do_find_drops_for_tables:unused:"
681+
"Cannot process Table unused: no date query specified"
682+
]
683+
),
684+
)
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""
2+
Helper functions for database operations
3+
"""
4+
5+
from datetime import datetime, timezone
6+
import logging
7+
8+
import partitionmanager.table_append_partition as pm_tap
9+
import partitionmanager.types
10+
11+
12+
def get_position_of_table(database, table, map_data):
    """Return a Position holding the table's current range-column values.

    The values come from pm_tap.get_current_positions and are stored in the
    Position in the same order as map_data["range_cols"].
    """
    range_cols = map_data["range_cols"]
    current = pm_tap.get_current_positions(database, table, range_cols)

    position = partitionmanager.types.Position()
    position.set_position([current[col] for col in range_cols])
    return position
21+
22+
23+
def calculate_exact_timestamp_via_query(database, table, position_partition):
    """Calculate the exact timestamp of a PositionPartition.

    Runs the table's ``earliest_utc_timestamp_query`` with the partition's
    single position value as its argument, then converts the one value in the
    one returned row into a timezone-aware UTC ``datetime``.

    Args:
        database: object whose ``run(sql)`` returns a sequence of row
            mappings.
        table: the table being examined; must have a date query.
        position_partition: the ``PositionPartition`` to date.

    Returns:
        A UTC ``datetime`` built from the query's result value.

    Raises:
        ValueError: if the position is incalculable, i.e. the table has no
            date query, the partition is not a PositionPartition, or the
            partition does not have exactly one position column.
        NoExactTimeException: if the query does not produce exactly one row
            holding exactly one non-None value.
    """

    log = logging.getLogger(f"calculate_exact_timestamp_via_query:{table.name}")

    if not table.has_date_query:
        raise ValueError("Table has no defined date query")

    if not isinstance(position_partition, partitionmanager.types.PositionPartition):
        raise ValueError("Only PositionPartitions are supported")

    if len(position_partition.position) != 1:
        raise ValueError(
            "This method is only valid for single-column partitions right now"
        )
    arg = position_partition.position.as_sql_input()[0]

    sql_select_cmd = table.earliest_utc_timestamp_query.get_statement_with_argument(arg)
    log.debug(
        "Executing %s to derive partition %s at position %s",
        sql_select_cmd,
        position_partition.name,
        position_partition.position,
    )

    # Wall-clock timing around the query, used only for the debug log below.
    start = datetime.now()
    exact_time_result = database.run(sql_select_cmd)
    end = datetime.now()

    if len(exact_time_result) != 1:
        raise partitionmanager.types.NoExactTimeException("No exact timestamp result")
    row = exact_time_result[0]
    if len(row) != 1:
        raise partitionmanager.types.NoExactTimeException(
            "Unexpected column count for the timestamp result"
        )

    # The row is known to hold exactly one column; its name is irrelevant.
    (value,) = row.values()
    if value is None:
        # fromtimestamp() would reject None with a TypeError; report it as
        # the "no exact time" condition instead.
        raise partitionmanager.types.NoExactTimeException(
            "Timestamp result has no value"
        )
    exact_time = datetime.fromtimestamp(value, tz=timezone.utc)

    log.debug(
        "Exact time of %s returned for %s at position %s, query took %s",
        exact_time,
        position_partition.name,
        position_partition.position,
        (end - start),
    )
    return exact_time
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import unittest
2+
3+
from .database_helpers import get_position_of_table, calculate_exact_timestamp_via_query
4+
5+
from .types import (
6+
DatabaseCommand,
7+
NoExactTimeException,
8+
PositionPartition,
9+
SqlInput,
10+
SqlQuery,
11+
Table,
12+
)
13+
14+
15+
class MockDatabase(DatabaseCommand):
    """Scripted DatabaseCommand for tests.

    Responses are handed back in the order they were added; each one is
    paired with a substring the incoming command must contain.
    """

    def __init__(self):
        # New entries are inserted at the front and consumed from the back,
        # so the oldest queued response is always served first.
        self._responses = []
        self.num_queries = 0

    def add_response(self, expected, response):
        """Queue `response` for a future command containing `expected`."""
        entry = {"expected": expected, "response": response}
        self._responses.insert(0, entry)

    def run(self, cmd):
        """Count the query and return the oldest queued response.

        Raises Exception if nothing is queued or if `cmd` does not contain
        the substring the queued response expects.
        """
        self.num_queries += 1
        if len(self._responses) == 0:
            raise Exception(f"No mock responses available for cmd [{cmd}]")

        queued = self._responses.pop()
        wanted = queued["expected"]
        if wanted not in cmd:
            raise Exception(f"Received command [{cmd}] and expected [{wanted}]")

        return queued["response"]

    def db_name(self):
        """Return the fixed database name used by the mock."""
        return SqlInput("the-database")
36+
37+
38+
class TestDatabaseHelpers(unittest.TestCase):
    """Tests for get_position_of_table and calculate_exact_timestamp_via_query."""

    def test_position_of_table(self):
        """The mocked range-column value is returned as the table position."""
        db = MockDatabase()
        db.add_response("SELECT id FROM `burgers` ORDER BY", [{"id": 90210}])

        table = Table("burgers")
        data = {"range_cols": ["id"]}

        pos = get_position_of_table(db, table, data)
        self.assertEqual(pos.as_list(), [90210])

    def test_exact_timestamp_no_query(self):
        """A table without a date query is rejected before any query runs."""
        db = MockDatabase()
        db.add_response("SELECT id FROM `burgers` ORDER BY", [{"id": 42}])

        table = Table("burgers")
        self.assertFalse(table.has_date_query)

        pos = PositionPartition("p_start")
        pos.set_position([42])

        with self.assertRaises(ValueError):
            calculate_exact_timestamp_via_query(db, table, pos)
        # The ValueError is raised up front, so the database is never used.
        self.assertEqual(db.num_queries, 0)

    def test_exact_timestamp(self):
        """A single-row, single-column result becomes a UTC datetime."""
        db = MockDatabase()
        db.add_response(
            "SELECT UNIX_TIMESTAMP(`cooked`)", [{"UNIX_TIMESTAMP": 17541339060}]
        )

        table = Table("burgers")
        table.set_earliest_utc_timestamp_query(
            SqlQuery(
                "SELECT UNIX_TIMESTAMP(`cooked`) FROM `orders` "
                "WHERE `type` = \"burger\" AND `id` > '?' ORDER BY `id` ASC LIMIT 1;"
            )
        )

        pos = PositionPartition("p_start")
        pos.set_position([150])

        ts = calculate_exact_timestamp_via_query(db, table, pos)
        # assertEqual rather than a bare assert: it is not stripped under -O
        # and reports both values on failure.
        self.assertEqual(str(ts), "2525-11-11 18:11:00+00:00")

    def test_no_exact_timestamp(self):
        """Results with several rows or several columns are rejected."""
        db = MockDatabase()
        db.add_response(
            "SELECT UNIX_TIMESTAMP(`cooked`)",
            [{"UNIX_TIMESTAMP": 17541339060}, {"UNIX_TIMESTAMP": 17541339070}],
        )

        table = Table("burgers")
        table.set_earliest_utc_timestamp_query(
            SqlQuery(
                "SELECT UNIX_TIMESTAMP(`cooked`) FROM `orders` "
                "WHERE `type` = \"burger\" AND `id` > '?' ORDER BY `id` ASC LIMIT 1;"
            )
        )

        pos = PositionPartition("p_start")
        pos.set_position([150])

        # Two rows: not exactly one result.
        with self.assertRaises(NoExactTimeException):
            calculate_exact_timestamp_via_query(db, table, pos)

        db.add_response(
            "SELECT UNIX_TIMESTAMP(`cooked`)",
            [{"UNIX_TIMESTAMP": 17541339060, "column2": True}],
        )

        # One row but two columns: unexpected column count.
        with self.assertRaises(NoExactTimeException):
            calculate_exact_timestamp_via_query(db, table, pos)

0 commit comments

Comments
 (0)