Skip to content

Commit 872eb58

Browse files
scheduling datasets (#535)
Co-authored-by: dnabic-aws <[email protected]>
1 parent fcbff19 commit 872eb58

File tree

11 files changed

+296
-14
lines changed

11 files changed

+296
-14
lines changed

cfn-templates/cid-admin-policies.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,21 @@ Resources:
336336
Resource:
337337
- !Sub arn:aws:quicksight:${AWS::Region}:${AWS::AccountId}:dataset/* # DataSetIDs are dynamic
338338

339+
- Sid: QuickSightDataSetSchedule
340+
Action:
341+
- quicksight:CreateRefreshSchedule
342+
- quicksight:UpdateRefreshSchedule
343+
- quicksight:DeleteRefreshSchedule
344+
- quicksight:DescribeRefreshSchedule
345+
- quicksight:ListRefreshSchedules
346+
- quicksight:CreateDataSetRefreshProperties
347+
- quicksight:DescribeDataSetRefreshProperties
348+
- quicksight:UpdateDataSetRefreshProperties
349+
- quicksight:DeleteDataSetRefreshProperties
350+
Effect: Allow
351+
Resource:
352+
- !Sub arn:aws:quicksight:${AWS::Region}:${AWS::AccountId}:dataset/*/refresh-schedule/* # DataSetIDs are dynamic as well as shcedule ids
353+
339354
- Sid: CreateQueryResultsBucketS3
340355
Action:
341356
- s3:CreateBucket

cfn-templates/cid-cfn.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,7 @@ Parameters:
109109
Description: See https://quicksight.aws.amazon.com/sn/admin#users
110110
QuickSightDataSetRefreshSchedule:
111111
Type: String
112-
MinLength: 3
113-
Default: cron(0 4 * * ? *)
112+
Default: ''
114113
Description: REQUIRED - cron expression on when to refresh spice datasets daily outside of business hours. Default is 4 AM utc, this should work for most customers in US and EU time zones.'
115114
CURBucketPath:
116115
Type: String

cid/builtin/core/data/resources.yaml

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,22 +95,30 @@ datasets:
9595
views:
9696
- summary_view
9797
- ri_sp_mapping
98+
schedules:
99+
- default
98100
ec2_running_cost:
99101
File: cid/ec2_running_cost.json
100102
dependsOn:
101103
views:
102104
- ec2_running_cost
105+
schedules:
106+
- default
103107
compute_savings_plan_eligible_spend:
104108
File: cid/compute.json
105109
dependsOn:
106110
views:
107111
- compute_savings_plan_eligible_spend
112+
schedules:
113+
- default
108114
s3_view:
109115
File: cid/s3_view.json
110116
dependsOn:
111117
views:
112118
- s3_view
113119
- account_map
120+
schedules:
121+
- default
114122

115123
# Shared DataSets
116124
customer_all:
@@ -126,26 +134,36 @@ datasets:
126134
dependsOn:
127135
views:
128136
- kpi_ebs_snap
137+
schedules:
138+
- default
129139
kpi_ebs_storage_all:
130140
File: kpi/kpi_ebs_storage_all.json
131141
dependsOn:
132142
views:
133143
- kpi_ebs_storage_all
144+
schedules:
145+
- default
134146
kpi_instance_all:
135147
File: kpi/kpi_instance_all.json
136148
dependsOn:
137149
views:
138150
- kpi_instance_all
151+
schedules:
152+
- default
139153
kpi_s3_storage_all:
140154
File: kpi/kpi_s3_storage_all.json
141155
dependsOn:
142156
views:
143157
- kpi_s3_storage_all
158+
schedules:
159+
- default
144160
kpi_tracker:
145161
File: kpi/kpi_tracker.json
146162
dependsOn:
147163
views:
148164
- kpi_tracker
165+
schedules:
166+
- default
149167

150168

151169
# Trusted Advisor (TAO)
@@ -154,18 +172,24 @@ datasets:
154172
dependsOn:
155173
views:
156174
- ta_org_view
175+
schedules:
176+
- default
157177

158178
# Trends
159179
daily-anomaly-detection:
160180
File: trends/daily_anomaly_detection.json
161181
dependsOn:
162182
views:
163183
- daily_anomaly_detection
184+
schedules:
185+
- default
164186
monthly-anomaly-detection:
165187
File: trends/monthly_anomaly_detection.json
166188
dependsOn:
167189
views:
168190
- monthly_anomaly_detection
191+
schedules:
192+
- default
169193
monthly-bill-by-account:
170194
spriFile: trends/monthly_bill_by_account_sp_ri.json
171195
spFile: trends/monthly_bill_by_account_sp.json
@@ -174,7 +198,8 @@ datasets:
174198
dependsOn:
175199
views:
176200
- monthly_bill_by_account
177-
201+
schedules:
202+
- default
178203

179204
# Compute Optimiser (CO)
180205
compute_optimizer_all_options:
@@ -183,6 +208,8 @@ datasets:
183208
views:
184209
- compute_optimizer_all_options
185210
- business_units_map
211+
schedules:
212+
- default
186213
parameters:
187214
primary_tag_name:
188215
default: 'application'
@@ -404,3 +431,12 @@ views:
404431
- aws_accounts
405432
ta_descriptions:
406433
File: shared/ta_descriptions.sql
434+
435+
# Refresh Schedules for QuickSight DataSets
436+
schedules:
437+
default:
438+
ScheduleId: cid
439+
ScheduleFrequency:
440+
Interval: DAILY
441+
TimeOfTheDay: '02:00-05:00'
442+
RefreshType: FULL_REFRESH

cid/common.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -217,9 +217,9 @@ def getPlugin(self, plugin) -> dict:
217217
def get_definition(self, type: str, name: str=None, id: str=None) -> dict:
218218
""" return resource definition that matches parameters """
219219
res = None
220-
if type not in ['dashboard', 'dataset', 'view']:
220+
if type not in ['dashboard', 'dataset', 'view', 'schedule']:
221221
raise ValueError(f'{type} is not a valid definition type')
222-
if type in ['dataset', 'view'] and name:
222+
if type in ['dataset', 'view', 'schedule'] and name:
223223
res = self.resources.get(f'{type}s').get(name)
224224
elif type in ['dashboard']:
225225
for definition in self.resources.get(f'{type}s').values():
@@ -275,7 +275,7 @@ def load_resources(self):
275275
'''
276276
if get_parameters().get('resources'):
277277
source = get_parameters().get('resources')
278-
logging.info(f'Loading resources from {source}')
278+
logger.info(f'Loading resources from {source}')
279279
resources = {}
280280
try:
281281
if source.startswith('https://'):
@@ -1378,11 +1378,21 @@ def create_or_update_dataset(self, dataset_definition: dict, dataset_id: str=Non
13781378
break
13791379
if update_dataset:
13801380
self.qs.update_dataset(compiled_dataset)
1381+
if compiled_dataset.get("ImportMode") == "SPICE":
1382+
dataset_id = compiled_dataset.get('DataSetId')
1383+
schedules_definitions = []
1384+
for schedule_name in dataset_definition.get('schedules', []):
1385+
schedules_definitions.append(self.get_definition("schedule", name=schedule_name))
1386+
self.qs.ensure_dataset_refresh_schedule(dataset_id, schedules_definitions)
13811387
else:
13821388
print(f'No update requested for dataset {compiled_dataset.get("DataSetId")} {compiled_dataset.get("Name")}={found_dataset.name} ')
13831389
else:
1384-
self.qs.create_dataset(compiled_dataset)
1385-
1390+
dataset_id = self.qs.create_dataset(compiled_dataset)
1391+
if dataset_id and compiled_dataset.get("ImportMode") == "SPICE":
1392+
schedules_definitions = []
1393+
for schedule_name in dataset_definition.get('schedules', []):
1394+
schedules_definitions.append(self.get_definition("schedule", name=schedule_name))
1395+
self.qs.ensure_dataset_refresh_schedule(dataset_id, schedules_definitions)
13861396
return True
13871397

13881398

cid/export.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ def export_analysis(qs, athena):
160160
datasets[dataset_name] = {
161161
'data': dataset_data,
162162
'dependsOn': {'views': dependancy_views},
163+
'schedules': ['default'], #FIXME: need to read a real schedule
163164
}
164165
if dep_cur:
165166
datasets[dataset_name]['dependsOn']['cur'] = True

cid/helpers/quicksight/__init__.py

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
import click
1111

1212
from cid.base import CidBase
13-
from cid.helpers import diff
13+
from cid.helpers import diff, timezone, randtime
1414
from cid.helpers.quicksight.dashboard import Dashboard
1515
from cid.helpers.quicksight.dataset import Dataset
1616
from cid.helpers.quicksight.datasource import Datasource
1717
from cid.helpers.quicksight.template import Template as CidQsTemplate
18-
from cid.utils import get_parameter, get_parameters
18+
from cid.utils import get_parameter, get_parameters, exec_env, cid_print
1919
from cid.exceptions import CidCritical, CidError
2020

2121
logger = logging.getLogger(__name__)
@@ -1034,6 +1034,123 @@ def update_dataset(self, definition: dict) -> Dataset:
10341034
return self.describe_dataset(dataset_id)
10351035

10361036

1037+
def get_dataset_refresh_schedules(self, dataset_id):
1038+
"""Returns refresh schedules for given dataset id"""
1039+
try:
1040+
refresh_schedules = self.client.list_refresh_schedules(
1041+
AwsAccountId=self.account_id,
1042+
DataSetId=dataset_id
1043+
)
1044+
return refresh_schedules.get("RefreshSchedules")
1045+
except self.client.exceptions.ResourceNotFoundException as exc:
1046+
raise CidError(f'DataSource {dataset_id} does not exist') from exc
1047+
except self.client.exceptions.AccessDeniedException as exc:
1048+
raise CidError(f'No quicksight:ListDataSets permission') from exc
1049+
except Exception as exc:
1050+
raise CidError(f'Unable to list refresh schedules for dataset {dataset_id}: {str(exc)}') from exc
1051+
1052+
1053+
def ensure_dataset_refresh_schedule(self, dataset_id, schedules: list):
1054+
""" Ensures that dataset has scheduled refresh """
1055+
# get all existing schedules for the given dataset
1056+
try:
1057+
existing_schedules = self.get_dataset_refresh_schedules(dataset_id)
1058+
except CidError as exc:
1059+
logger.debug(exc, exc_info=True)
1060+
logger.warning(
1061+
f'Cannot read dataset schedules for dataset = {dataset_id}. {str(exc)}. Skipping schedule management.'
1062+
' Please make sure scheduled refresh is configured manualy.'
1063+
)
1064+
return
1065+
1066+
if schedules:
1067+
if exec_env()['terminal'] in ('lambda'):
1068+
schedule_frequency_timezone = get_parameters().get("timezone", timezone.get_default_timezone())
1069+
else:
1070+
schedule_frequency_timezone = get_parameter("timezone",
1071+
message='Please select timezone for datasets scheduled refresh time',
1072+
choices=timezone.get_all_timezones(),
1073+
default=timezone.get_default_timezone()
1074+
)
1075+
1076+
for schedule in schedules:
1077+
1078+
# Get the list of exising schedules with the same id
1079+
existing_schedule = None
1080+
for existing in existing_schedules:
1081+
if schedule["ScheduleId"] == existing["ScheduleId"]:
1082+
existing_schedule = existing
1083+
break
1084+
1085+
# Verify that all schedule parameters are set
1086+
schedule["ScheduleId"] = schedule.get("ScheduleId", "cid")
1087+
if "ScheduleFrequency" not in schedule:
1088+
schedule["ScheduleFrequency"] = {}
1089+
schedule["ScheduleFrequency"]["Timezone"] = schedule_frequency_timezone
1090+
try:
1091+
schedule["ScheduleFrequency"]["TimeOfTheDay"] = randtime.get_random_time_from_range(
1092+
self.account_id + dataset_id,
1093+
schedule["ScheduleFrequency"].get("TimeOfTheDay", "")
1094+
)
1095+
except Exception as exc:
1096+
logger.error(
1097+
f'Invalid timerange for schedule with id "{schedule["ScheduleId"]}"'
1098+
f' and dataset {dataset_id}: {str(exc)} ... skipping.'
1099+
f' Please create dataset refresh schedule manually.'
1100+
)
1101+
continue
1102+
schedule["ScheduleFrequency"]["Interval"] = schedule["ScheduleFrequency"].get("Interval", "DAILY")
1103+
schedule["RefreshType"] = schedule.get("RefreshType", "FULL_REFRESH")
1104+
if "providedBy" in schedule:
1105+
del schedule["providedBy"]
1106+
1107+
if not existing_schedule:
1108+
# Avoid adding a new schedule when customer already has put a schedule manually as this can lead to additional charges.
1109+
schedules_with_different_id = [existing for existing in existing_schedules if schedule["ScheduleId"] != existing["ScheduleId"] ]
1110+
if schedules_with_different_id:
1111+
logger.info(
1112+
f'Found the same schedule {schedule.get("RefreshType")} / {schedule["ScheduleFrequency"].get("Interval")}'
1113+
f' but with different id. Skipping to avoid duplication. Please delete all manually created schedules for dataset {dataset_id}'
1114+
)
1115+
continue
1116+
logger.debug(f'Creating refresh schedule with id {schedule["ScheduleId"]} for dataset {dataset_id}.')
1117+
try:
1118+
self.client.create_refresh_schedule(
1119+
DataSetId=dataset_id,
1120+
AwsAccountId=self.account_id,
1121+
Schedule=schedule
1122+
)
1123+
logger.debug(f'Refresh schedule with id {schedule["ScheduleId"]} for dataset {dataset_id} is created.')
1124+
except self.client.exceptions.ResourceNotFoundException:
1125+
logger.error(f'Unable to create refresh schedule with id {schedule["ScheduleId"]}. Dataset {dataset_id} does not exist.')
1126+
except self.client.exceptions.AccessDeniedException:
1127+
logger.error(f'Unable to create refresh schedule with id {schedule["ScheduleId"]}. Please add quicksight:CreateDataSet permission.')
1128+
except Exception as exc:
1129+
logger.error(f'Unable to create refresh schedule with id {schedule["ScheduleId"]} for dataset "{dataset_id}": {str(exc)}')
1130+
else:
1131+
# schedule exists so we need to update
1132+
logger.debug(f'Updating refresh schedule with id {schedule["ScheduleId"]} for dataset {dataset_id}.')
1133+
try:
1134+
self.client.update_refresh_schedule(
1135+
DataSetId=dataset_id,
1136+
AwsAccountId=self.account_id,
1137+
Schedule=schedule
1138+
)
1139+
logger.debug(f'Refresh schedule with id {schedule["ScheduleId"]} for dataset {dataset_id} is updated.')
1140+
except self.client.exceptions.ResourceNotFoundException:
1141+
logger.error(f'Unable to update refresh schedule with id {schedule["ScheduleId"]}. Dataset {dataset_id} does not exist.')
1142+
except self.client.exceptions.AccessDeniedException:
1143+
logger.error(f'Unable to update refresh schedule with id {schedule["ScheduleId"]}. Please add quicksight:UpdqteDataSet permission.')
1144+
except Exception as exc:
1145+
logger.error(f'Unable to update refresh schedule with id {schedule["ScheduleId"]} for dataset "{dataset_id}": {str(exc)}')
1146+
1147+
# Verify if there is at least one schedule and warn user if not
1148+
try:
1149+
if not self.get_dataset_refresh_schedules(dataset_id):
1150+
logger.warning(f'There is no refresh schedule for dataset "{dataset_id}". Please create a refresh schedule manually.' )
1151+
except CidError:
1152+
pass
1153+
10371154
def create_dashboard(self, definition: dict) -> Dashboard:
10381155
""" Creates an AWS QuickSight dashboard """
10391156

cid/helpers/randtime.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
''' Helper functions for dataset schedules
2+
'''
3+
import hashlib
4+
from datetime import datetime, timedelta
5+
6+
def pseudo_random_generator(hashable_string: str, maximum: int=100) -> int:
7+
"""Gernerate a pseudo random integer number, but the same for any given hashable_string identifier """
8+
hash_hex = hashlib.md5(bytes(hashable_string, "utf-8"), usedforsecurity=False).hexdigest()[:16] # nosec B303 - not used for security
9+
bigint_value = int.from_bytes(bytes.fromhex(hash_hex), 'little', signed=True)
10+
return bigint_value % int(maximum)
11+
12+
def get_random_time_from_range(hashable_string, time_range):
13+
""" Generate a random time from a given range
14+
In case that input time is in format hh:mm, just return it back.
15+
When input time is time range hh:mm-hh:mm, then return random time in format hh:mm within provided time range
16+
"""
17+
items = time_range.strip().split('-')
18+
19+
if len(items) == 1:
20+
try:
21+
return datetime.strptime(time_range.strip(), '%H:%M').strftime('%H:%M')
22+
except Exception as exc:
23+
raise ValueError(f'Invalid time range "{time_range}": {str(exc)}') from exc
24+
elif len(items) == 2:
25+
try:
26+
time_from = datetime.strptime(items[0].strip(), '%H:%M')
27+
time_to = datetime.strptime(items[1].strip(), '%H:%M')
28+
if time_to < time_from:
29+
time_to += timedelta(days=1)
30+
time_diff_sec = (time_to - time_from).total_seconds()
31+
return (time_from + timedelta(seconds=pseudo_random_generator(hashable_string, time_diff_sec))).strftime('%H:%M')
32+
except Exception as exc:
33+
raise ValueError(f'Invalid time range "{time_range}": {str(exc)}') from exc
34+
else:
35+
raise ValueError(f'Invalid time range "{time_range}". Please provide timerange in format hh:mm or hh:mm-hh:mm')

0 commit comments

Comments
 (0)