Skip to content

Commit 0ad6ba4

Browse files
authored
šŸ›Computational backend: re-enable service resources override (#4970)
1 parent 55be1a7 commit 0ad6ba4

File tree

12 files changed

+392
-46
lines changed

12 files changed

+392
-46
lines changed

ā€Žpackages/models-library/src/models_library/api_schemas_resource_usage_tracker/pricing_plans.pyā€Ž

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Any, ClassVar
44

55
from models_library.resource_tracker import (
6+
HardwareInfo,
67
PricingPlanClassification,
78
PricingPlanId,
89
PricingUnitCostId,
@@ -18,7 +19,7 @@ class PricingUnitGet(BaseModel):
1819
current_cost_per_unit: Decimal
1920
current_cost_per_unit_id: PricingUnitCostId
2021
default: bool
21-
specific_info: dict
22+
specific_info: HardwareInfo
2223

2324
class Config:
2425
schema_extra: ClassVar[dict[str, Any]] = {
@@ -30,8 +31,9 @@ class Config:
3031
"current_cost_per_unit": 5.7,
3132
"current_cost_per_unit_id": 1,
3233
"default": True,
33-
"specific_info": {},
34+
"specific_info": hw_config_example,
3435
}
36+
for hw_config_example in HardwareInfo.Config.schema_extra["examples"]
3537
]
3638
}
3739

@@ -55,9 +57,10 @@ class Config:
5557
"classification": "TIER",
5658
"created_at": "2023-01-11 13:11:47.293595",
5759
"pricing_plan_key": "pricing-plan-sleeper",
58-
"pricing_units": [
59-
PricingUnitGet.Config.schema_extra["examples"][0]
60-
],
60+
"pricing_units": [pricing_unit_get_example],
6161
}
62+
for pricing_unit_get_example in PricingUnitGet.Config.schema_extra[
63+
"examples"
64+
]
6265
]
6366
}

ā€Žpackages/models-library/src/models_library/resource_tracker.pyā€Ž

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,10 @@ class HardwareInfo(BaseModel):
5757

5858
class Config:
5959
schema_extra: ClassVar[dict[str, Any]] = {
60-
"examples": [{"aws_ec2_instances": ["c6a.4xlarge"]}]
60+
"examples": [
61+
{"aws_ec2_instances": ["c6a.4xlarge"]},
62+
{"aws_ec2_instances": []},
63+
]
6164
}
6265

6366

ā€Žpackages/models-library/src/models_library/wallets.pyā€Ž

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ class Config:
2727

2828
ZERO_CREDITS = Decimal(0)
2929

30-
31-
### DB
30+
#
31+
# DB
32+
#
3233

3334

3435
class WalletDB(BaseModel):

ā€Žservices/director-v2/src/simcore_service_director_v2/api/routes/computations.pyā€Ž

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
ClusterNotFoundError,
5050
ClustersKeeperNotAvailableError,
5151
ComputationalRunNotFoundError,
52+
ConfigurationError,
5253
PricingPlanUnitNotFoundError,
5354
ProjectNotFoundError,
5455
SchedulerError,
@@ -345,6 +346,8 @@ async def create_computation( # noqa: PLR0913
345346
raise HTTPException(
346347
status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail=f"{e}"
347348
) from e
349+
except ConfigurationError as e:
350+
raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=f"{e}") from e
348351

349352

350353
@router.get(

ā€Žservices/director-v2/src/simcore_service_director_v2/modules/db/repositories/comp_tasks/_utils.pyā€Ž

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
)
4646
from simcore_postgres_database.utils_projects_nodes import ProjectNodesRepo
4747

48-
from .....core.errors import ClustersKeeperNotAvailableError
48+
from .....core.errors import ClustersKeeperNotAvailableError, ConfigurationError
4949
from .....models.comp_tasks import CompTaskAtDB, Image, NodeSchema
5050
from .....modules.resource_usage_tracker_client import ResourceUsageTrackerClient
5151
from .....utils.comp_scheduler import COMPLETED_STATES
@@ -189,7 +189,8 @@ async def _get_pricing_and_hardware_infos(
189189
node_key: ServiceKey,
190190
node_version: ServiceVersion,
191191
) -> tuple[PricingInfo | None, HardwareInfo]:
192-
if not is_wallet:
192+
if not is_wallet or (to_node_class(node_key) == NodeClass.FRONTEND):
193+
# NOTE: frontend services have no pricing plans, therefore no need to call RUT
193194
return None, HardwareInfo(aws_ec2_instances=[])
194195
project_nodes_repo = ProjectNodesRepo(project_uuid=project_id)
195196
output = await project_nodes_repo.get_project_node_pricing_unit_id(
@@ -204,7 +205,7 @@ async def _get_pricing_and_hardware_infos(
204205
product_name, pricing_plan_id, pricing_unit_id
205206
)
206207
pricing_unit_cost_id = pricing_unit_get.current_cost_per_unit_id
207-
aws_ec2_instances = pricing_unit_get.specific_info["aws_ec2_instances"]
208+
aws_ec2_instances = pricing_unit_get.specific_info.aws_ec2_instances
208209
else:
209210
(
210211
pricing_plan_id,
@@ -231,6 +232,7 @@ async def _get_pricing_and_hardware_infos(
231232

232233

233234
_RAM_SAFE_MARGIN: Final[ByteSize] = parse_obj_as(ByteSize, "1GiB")
235+
_CPUS_SAFE_MARGIN: Final[float] = 0.1
234236

235237

236238
async def _update_project_node_resources_from_hardware_info(
@@ -263,28 +265,41 @@ def _by_type_name(ec2: EC2InstanceType) -> bool:
263265
selected_ec2_instance_type = next(
264266
iter(filter(_by_type_name, unordered_list_ec2_instance_types))
265267
)
268+
266269
# now update the project node required resources
267270
# NOTE: we keep a safe margin with the RAM as the dask-sidecar "sees"
268271
# less memory than the machine theoretical amount
269272
project_nodes_repo = ProjectNodesRepo(project_uuid=project_id)
270273
node = await project_nodes_repo.get(connection, node_id=node_id)
271274
node_resources = parse_obj_as(ServiceResourcesDict, node.required_resources)
272-
assert DEFAULT_SINGLE_SERVICE_NAME in node_resources # nosec
273-
image_resources: ImageResources = node_resources[DEFAULT_SINGLE_SERVICE_NAME]
274-
image_resources.resources["CPU"].set_value(
275-
float(selected_ec2_instance_type.cpus)
276-
)
277-
image_resources.resources["RAM"].set_value(
278-
selected_ec2_instance_type.ram - _RAM_SAFE_MARGIN
279-
)
280-
281-
await project_nodes_repo.update(
282-
connection,
283-
node_id=node_id,
284-
required_resources=ServiceResourcesDictHelpers.create_jsonable(
285-
node_resources
286-
),
275+
if DEFAULT_SINGLE_SERVICE_NAME in node_resources:
276+
image_resources: ImageResources = node_resources[
277+
DEFAULT_SINGLE_SERVICE_NAME
278+
]
279+
image_resources.resources["CPU"].set_value(
280+
float(selected_ec2_instance_type.cpus) - _CPUS_SAFE_MARGIN
281+
)
282+
image_resources.resources["RAM"].set_value(
283+
selected_ec2_instance_type.ram - _RAM_SAFE_MARGIN
284+
)
285+
286+
await project_nodes_repo.update(
287+
connection,
288+
node_id=node_id,
289+
required_resources=ServiceResourcesDictHelpers.create_jsonable(
290+
node_resources
291+
),
292+
)
293+
else:
294+
_logger.warning(
295+
"Services resource override not implemented yet for multi-container services!!!"
296+
)
297+
except StopIteration as exc:
298+
msg = (
299+
f"invalid EC2 type name selected {set(hardware_info.aws_ec2_instances)}."
300+
" TIP: adjust product configuration"
287301
)
302+
raise ConfigurationError(msg) from exc
288303
except (RemoteMethodNotRegisteredError, RPCServerError) as exc:
289304
raise ClustersKeeperNotAvailableError from exc
290305

@@ -357,14 +372,14 @@ async def generate_tasks_list_from_project(
357372
node_version=node.version,
358373
)
359374
assert rabbitmq_rpc_client # nosec
360-
# await _update_project_node_resources_from_hardware_info(
361-
# connection,
362-
# is_wallet=is_wallet,
363-
# project_id=project.uuid,
364-
# node_id=NodeID(node_id),
365-
# hardware_info=hardware_info,
366-
# rabbitmq_rpc_client=rabbitmq_rpc_client,
367-
# )
375+
await _update_project_node_resources_from_hardware_info(
376+
connection,
377+
is_wallet=is_wallet,
378+
project_id=project.uuid,
379+
node_id=NodeID(node_id),
380+
hardware_info=hardware_info,
381+
rabbitmq_rpc_client=rabbitmq_rpc_client,
382+
)
368383

369384
image = await _generate_task_image(
370385
catalog_client=catalog_client,

ā€Žservices/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.pyā€Ž

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ async def get_default_pricing_and_hardware_info(
107107
service_pricing_plan_get.pricing_plan_id,
108108
unit.pricing_unit_id,
109109
unit.current_cost_per_unit_id,
110-
unit.specific_info["aws_ec2_instances"],
110+
unit.specific_info.aws_ec2_instances,
111111
)
112112
raise PricingPlanUnitNotFoundError(
113113
"Default pricing plan and unit does not exist"

ā€Žservices/director-v2/tests/unit/test_modules_dynamic_sidecar_scheduler.pyā€Ž

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ async def test_scheduler_add_remove(
276276
assert scheduler_data.service_name not in scheduler._scheduler._to_observe
277277

278278

279+
@pytest.mark.flaky(max_runs=3)
279280
async def test_scheduler_removes_partially_started_services(
280281
disabled_scheduler_background_task: None,
281282
manually_trigger_scheduler: Callable[[], Awaitable[None]],

0 commit comments

Comments
Ā (0)