This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 3832a60

Convert max_tasks_per_node to task_slots_per_node
1 parent 9429d14 commit 3832a60

39 files changed: +141 additions, -87 deletions

config_templates/pool.yaml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ pool_specification:
   vm_count:
     dedicated: 4
     low_priority: 8
-  max_tasks_per_node: 1
+  task_slots_per_node: 1
   resize_timeout: 00:20:00
   node_fill_type: pack
   autoscale:
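The rename is purely a key change in the pool specification. A one-off migration of an existing `pool.yaml` could look like the following sketch (a hypothetical helper, not part of this commit; it assumes PyYAML and will not preserve comments or key ordering):

# Hypothetical migration helper (not part of this commit): rename
# max_tasks_per_node to task_slots_per_node in an existing pool.yaml.
import yaml  # PyYAML, assumed to be available


def migrate_pool_yaml(path):
    with open(path) as f:
        data = yaml.safe_load(f)
    spec = data.get('pool_specification', {})
    if 'max_tasks_per_node' in spec:
        # carry the configured value over to the new key name
        spec['task_slots_per_node'] = spec.pop('max_tasks_per_node')
    with open(path, 'w') as f:
        yaml.safe_dump(data, f, default_flow_style=False)


if __name__ == '__main__':
    migrate_pool_yaml('pool.yaml')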

convoy/autoscale.py

Lines changed: 6 additions & 6 deletions
@@ -32,7 +32,7 @@
 _UNBOUND_MAX_NODES = 16777216
 AutoscaleMinMax = collections.namedtuple(
     'AutoscaleMinMax', [
-        'max_tasks_per_node',
+        'task_slots_per_node',
         'min_target_dedicated',
         'min_target_low_priority',
         'max_target_dedicated',
@@ -79,7 +79,7 @@ def _formula_tasks(pool):
             task_type,
             pool.autoscale.scenario.required_sample_percentage,
         ),
-        'reqVMs = {}TaskAvg / maxTasksPerNode'.format(task_type),
+        'reqVMs = {}TaskAvg / taskSlotsPerNode'.format(task_type),
     ]
     if pool.autoscale.scenario.rebalance_preemption_percentage is not None:
         req_vms.extend([
@@ -101,7 +101,7 @@ def _formula_tasks(pool):
         '{}TaskAvg = avg(${}Tasks.GetSample(sli, {}))'.format(
             task_type, task_type,
             pool.autoscale.scenario.required_sample_percentage),
-        'reqVMs = {}TaskAvg / maxTasksPerNode'.format(task_type),
+        'reqVMs = {}TaskAvg / taskSlotsPerNode'.format(task_type),
         'reqVMs = ({}TaskAvg > 0 && reqVMs < 1) ? 1 : reqVMs'.format(
             task_type),
     ]
@@ -186,7 +186,7 @@ def _formula_tasks(pool):
             pool.autoscale.scenario.bias_node_type))
     target_vms = ';\n'.join(target_vms)
     formula = [
-        'maxTasksPerNode = {}'.format(minmax.max_tasks_per_node),
+        'taskSlotsPerNode = {}'.format(minmax.task_slots_per_node),
         'minTargetDedicated = {}'.format(minmax.min_target_dedicated),
         'minTargetLowPriority = {}'.format(minmax.min_target_low_priority),
         'maxTargetDedicated = {}'.format(minmax.max_target_dedicated),
@@ -274,7 +274,7 @@ def _formula_day_of_week(pool):
             pool.autoscale.scenario.bias_node_type))
     target_vms = ';\n'.join(target_vms)
     formula = [
-        'maxTasksPerNode = {}'.format(minmax.max_tasks_per_node),
+        'taskSlotsPerNode = {}'.format(minmax.task_slots_per_node),
         'minTargetDedicated = {}'.format(minmax.min_target_dedicated),
         'minTargetLowPriority = {}'.format(minmax.min_target_low_priority),
         'maxTargetDedicated = {}'.format(minmax.max_target_dedicated),
@@ -327,7 +327,7 @@ def _get_minmax(pool):
     if max_inc_low_priority <= 0:
         max_inc_low_priority = _UNBOUND_MAX_NODES
     return AutoscaleMinMax(
-        max_tasks_per_node=pool.max_tasks_per_node,
+        task_slots_per_node=pool.task_slots_per_node,
         min_target_dedicated=min_target_dedicated,
         min_target_low_priority=min_target_low_priority,
         max_target_dedicated=max_target_dedicated,

convoy/batch.py

Lines changed: 69 additions & 17 deletions
@@ -1313,7 +1313,7 @@ def list_pools(batch_client, config):
             ' * low priority:',
             ' * current: {}'.format(pool.current_low_priority_nodes),
             ' * target: {}'.format(pool.target_low_priority_nodes),
-            ' * max tasks per node: {}'.format(pool.max_tasks_per_node),
+            ' * task slots per node: {}'.format(pool.task_slots_per_node),
             ' * enable inter node communication: {}'.format(
                 pool.enable_inter_node_communication),
             ' * autoscale enabled: {}'.format(pool.enable_auto_scale),
@@ -1517,10 +1517,10 @@ def pool_stats(batch_client, config, pool_id=None):
         if node.running_tasks_count is not None:
             tasks_running.append(node.running_tasks_count)
     total_running_tasks = sum(tasks_running)
-    runnable_task_slots = runnable_nodes * pool.max_tasks_per_node
+    runnable_task_slots = runnable_nodes * pool.task_slots_per_node
     total_task_slots = (
         pool.current_dedicated_nodes + pool.current_low_priority_nodes
-    ) * pool.max_tasks_per_node
+    ) * pool.task_slots_per_node
     busy_task_slots_fraction = (
         0 if runnable_task_slots == 0 else
         total_running_tasks / runnable_task_slots
@@ -1998,17 +1998,32 @@ def job_stats(batch_client, config, jobid=None):
     task_wall_times = []
     task_counts = batchmodels.TaskCounts(
         active=0, running=0, completed=0, succeeded=0, failed=0)
+    task_slots = batchmodels.TaskSlotCounts(
+        active=0, running=0, completed=0, succeeded=0, failed=0)
     total_tasks = 0
+    total_slots = 0
     for job in jobs:
         job_count += 1
         # get task counts
         tc = batch_client.job.get_task_counts(job_id=job.id)
-        task_counts.active += tc.active
-        task_counts.running += tc.running
-        task_counts.completed += tc.completed
-        task_counts.succeeded += tc.succeeded
-        task_counts.failed += tc.failed
-        total_tasks += tc.active + tc.running + tc.completed
+        task_counts.active += tc.task_counts.active
+        task_counts.running += tc.task_counts.running
+        task_counts.completed += tc.task_counts.completed
+        task_counts.succeeded += tc.task_counts.succeeded
+        task_counts.failed += tc.task_counts.failed
+        total_tasks += (
+            tc.task_counts.active + tc.task_counts.running +
+            tc.task_counts.completed
+        )
+        task_slots.active += tc.task_slot_counts.active
+        task_slots.running += tc.task_slot_counts.running
+        task_slots.completed += tc.task_slot_counts.completed
+        task_slots.succeeded += tc.task_slot_counts.succeeded
+        task_slots.failed += tc.task_slot_counts.failed
+        total_slots += (
+            tc.task_slot_counts.active + tc.task_slot_counts.running +
+            tc.task_slot_counts.completed
+        )
         if job.execution_info.end_time is not None:
             job_times.append(
                 (job.execution_info.end_time -
@@ -2054,6 +2069,29 @@ def job_stats(batch_client, config, jobid=None):
             100 * task_counts.failed / task_counts.completed
             if task_counts.completed > 0 else 0
         ),
+        '* Total slots: {}'.format(total_slots),
+        ' * Active: {0} ({1:.2f}% of total)'.format(
+            task_slots.active,
+            100 * task_slots.active / total_slots if total_slots > 0 else 0
+        ),
+        ' * Running: {0} ({1:.2f}% of total)'.format(
+            task_slots.running,
+            100 * task_slots.running / total_slots if total_slots > 0 else 0
+        ),
+        ' * Completed: {0} ({1:.2f}% of total)'.format(
+            task_slots.completed,
+            100 * task_slots.completed / total_slots if total_slots > 0 else 0
+        ),
+        ' * Succeeded: {0} ({1:.2f}% of completed)'.format(
+            task_slots.succeeded,
+            100 * task_slots.succeeded / task_slots.completed
+            if task_slots.completed > 0 else 0
+        ),
+        ' * Failed: {0} ({1:.2f}% of completed)'.format(
+            task_slots.failed,
+            100 * task_slots.failed / task_slots.completed
+            if task_slots.completed > 0 else 0
+        ),
     ]
     if len(job_times) > 0:
         log.extend([
@@ -3883,7 +3921,7 @@ def get_task_counts(batch_client, config, jobid=None):
     raw = {}
     for job in jobs:
         jobid = settings.job_id(job)
-        log = ['task counts for job {}'.format(jobid)]
+        log = ['task counts and slot counts for job {}'.format(jobid)]
         try:
             if settings.raw(config):
                 raw[jobid] = util.print_raw_output(
@@ -3901,11 +3939,25 @@ def get_task_counts(batch_client, config, jobid=None):
             raise
         else:
             if not settings.raw(config):
-                log.append('* active: {}'.format(tc.active))
-                log.append('* running: {}'.format(tc.running))
-                log.append('* completed: {}'.format(tc.completed))
-                log.append(' * succeeded: {}'.format(tc.succeeded))
-                log.append(' * failed: {}'.format(tc.failed))
+                log.append('* task counts:')
+                log.append(' * active: {}'.format(tc.task_counts.active))
+                log.append(' * running: {}'.format(tc.task_counts.running))
+                log.append(' * completed: {}'.format(
+                    tc.task_counts.completed))
+                log.append(' * succeeded: {}'.format(
+                    tc.task_counts.succeeded))
+                log.append(' * failed: {}'.format(tc.task_counts.failed))
+                log.append('* task slots:')
+                log.append(' * active: {}'.format(
+                    tc.task_slot_counts.active))
+                log.append(' * running: {}'.format(
+                    tc.task_slot_counts.running))
+                log.append(' * completed: {}'.format(
+                    tc.task_slot_counts.completed))
+                log.append(' * succeeded: {}'.format(
+                    tc.task_slot_counts.succeeded))
+                log.append(' * failed: {}'.format(
+                    tc.task_slot_counts.failed))
             logger.info(os.linesep.join(log))
     if util.is_not_empty(raw):
         util.print_raw_json(raw)
@@ -5429,15 +5481,15 @@ def add_jobs(
                 if recurrence.job_manager.allow_low_priority_node
                 else 0
             )
-            total_slots = cloud_pool.max_tasks_per_node * total_vms
+            total_slots = cloud_pool.task_slots_per_node * total_vms
         else:
             total_vms = (
                 pool.vm_count.dedicated +
                 pool.vm_count.low_priority
                 if recurrence.job_manager.allow_low_priority_node
                 else 0
            )
-            total_slots = pool.max_tasks_per_node * total_vms
+            total_slots = pool.task_slots_per_node * total_vms
         if total_slots == 1:
             logger.error(
                 ('Only 1 scheduling slot available which is '
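The `job_stats` and `get_task_counts` changes above track the newer Azure Batch Python SDK, where `job.get_task_counts` returns a result object exposing both `task_counts` and `task_slot_counts`. A minimal aggregation sketch along the same lines (assumes a configured `batch_client` and an SDK version that returns the split counts):

# Minimal sketch, assuming a configured batch_client and an azure-batch SDK
# version where job.get_task_counts returns both task_counts and
# task_slot_counts (as used in the diff above).
def summarize_counts(batch_client, job_ids):
    totals = {'tasks': 0, 'slots': 0}
    for job_id in job_ids:
        tc = batch_client.job.get_task_counts(job_id=job_id)
        # tasks and slots are tracked separately; a task requiring multiple
        # slots consumes more of a node's task_slots_per_node capacity
        totals['tasks'] += (
            tc.task_counts.active + tc.task_counts.running +
            tc.task_counts.completed)
        totals['slots'] += (
            tc.task_slot_counts.active + tc.task_slot_counts.running +
            tc.task_slot_counts.completed)
    return totals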

convoy/fleet.py

Lines changed: 4 additions & 4 deletions
@@ -1454,7 +1454,7 @@ def _construct_pool_object(
             pool_settings.vm_count.low_priority if not asenable else None
         ),
         resize_timeout=pool_settings.resize_timeout if not asenable else None,
-        max_tasks_per_node=pool_settings.max_tasks_per_node,
+        task_slots_per_node=pool_settings.task_slots_per_node,
         enable_inter_node_communication=pool_settings.
         inter_node_communication_enabled,
         start_task=batchmodels.StartTask(
@@ -1778,7 +1778,7 @@ def _construct_auto_pool_specification(
     poolspec = batchmodels.PoolSpecification(
         vm_size=pool.vm_size,
         virtual_machine_configuration=pool.virtual_machine_configuration,
-        max_tasks_per_node=pool.max_tasks_per_node,
+        task_slots_per_node=pool.task_slots_per_node,
         task_scheduling_policy=pool.task_scheduling_policy,
         resize_timeout=pool.resize_timeout,
         target_dedicated_nodes=pool.target_dedicated_nodes,
@@ -2783,9 +2783,9 @@ def _adjust_settings_for_pool_creation(config):
                 raise ValueError(
                     'vm_count dedicated should exceed 1 for glusterfs '
                     'on compute')
-            if pool.max_tasks_per_node > 1:
+            if pool.task_slots_per_node > 1:
                 raise ValueError(
-                    'max_tasks_per_node cannot exceed 1 for glusterfs '
+                    'task_slots_per_node cannot exceed 1 for glusterfs '
                     'on compute')
             num_gluster += 1
             try:
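Both pool construction paths now pass `task_slots_per_node` straight through to the Batch models. A standalone sketch of the equivalent SDK call (assumes an azure-batch version that supports `task_slots_per_node`; the image reference and SKU values are placeholders):

# Standalone sketch; image reference and SKU values are placeholders and the
# azure-batch SDK is assumed to support task_slots_per_node.
import azure.batch.models as batchmodels

pool = batchmodels.PoolAddParameter(
    id='example-pool',
    vm_size='STANDARD_D2_V3',
    virtual_machine_configuration=batchmodels.VirtualMachineConfiguration(
        image_reference=batchmodels.ImageReference(
            publisher='canonical',
            offer='ubuntuserver',
            sku='18.04-lts'),
        node_agent_sku_id='batch.node.ubuntu 18.04'),
    target_dedicated_nodes=4,
    target_low_priority_nodes=8,
    task_slots_per_node=2,  # replaces the former max_tasks_per_node
)
# batch_client.pool.add(pool)  # with a configured BatchServiceClient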

convoy/settings.py

Lines changed: 3 additions & 3 deletions
@@ -205,7 +205,7 @@
 )
 PoolSettings = collections.namedtuple(
     'PoolSettings', [
-        'id', 'vm_size', 'vm_count', 'resize_timeout', 'max_tasks_per_node',
+        'id', 'vm_size', 'vm_count', 'resize_timeout', 'task_slots_per_node',
         'inter_node_communication_enabled', 'vm_configuration',
         'reboot_on_start_task_failed', 'attempt_recovery_on_unusable',
         'block_until_all_global_resources_loaded',
@@ -1277,7 +1277,7 @@ def pool_settings(config):
     :return: pool settings from specification
     """
     conf = pool_specification(config)
-    max_tasks_per_node = _kv_read(conf, 'max_tasks_per_node', default=1)
+    task_slots_per_node = _kv_read(conf, 'task_slots_per_node', default=1)
    resize_timeout = _kv_read_checked(conf, 'resize_timeout')
     if util.is_not_empty(resize_timeout):
         resize_timeout = util.convert_string_to_timedelta(resize_timeout)
@@ -1446,7 +1446,7 @@ def pool_settings(config):
         vm_size=_pool_vm_size(config),
         vm_count=_pool_vm_count(config),
         resize_timeout=resize_timeout,
-        max_tasks_per_node=max_tasks_per_node,
+        task_slots_per_node=task_slots_per_node,
         inter_node_communication_enabled=inter_node_communication_enabled,
         vm_configuration=_populate_pool_vm_configuration(config),
         reboot_on_start_task_failed=reboot_on_start_task_failed,
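Note that `pool_settings` now only reads the new key (with a default of `1`), so a leftover `max_tasks_per_node` entry in an older configuration would simply be ignored. A backward-compatible read is not part of this change, but a hypothetical fallback could look like:

# Hypothetical backward-compatible read; this commit does NOT do this, it
# only reads task_slots_per_node with a default of 1.
import warnings


def read_task_slots_per_node(pool_conf):
    value = pool_conf.get('task_slots_per_node')
    if value is not None:
        return value
    legacy = pool_conf.get('max_tasks_per_node')
    if legacy is not None:
        warnings.warn(
            'max_tasks_per_node is deprecated; use task_slots_per_node')
        return legacy
    return 1  # default when neither key is specified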

docs/13-batch-shipyard-configuration-pool.md

Lines changed: 10 additions & 9 deletions
@@ -32,7 +32,7 @@ pool_specification:
   vm_count:
     dedicated: 4
     low_priority: 8
-  max_tasks_per_node: 1
+  task_slots_per_node: 1
   resize_timeout: 00:20:00
   node_fill_type: pack
   autoscale:
@@ -242,15 +242,15 @@ of nodes). The format for this property is a timedelta with a string
 representation of "d.HH:mm:ss". "HH:mm:ss" is required, but "d" is optional,
 if specified. If not specified, the default is 15 minutes. This should not
 be specified (and is ignored) for `autoscale` enabled pools.
-* (optional) `max_tasks_per_node` is the maximum number of concurrent tasks
-  that can be running at any one time on a compute node. This defaults to a
-  value of 1 if not specified. The maximum value for the property that Azure
-  Batch will accept is `4 x <# cores per compute node>`. For instance, for a
-  `STANDARD_F2` instance, because the virtual machine has 2 cores, the maximum
-  allowable value for this property would be `8`.
+* (optional) `task_slots_per_node` is the maximum number of concurrent task
+  slots configured for a single compute node. This defaults to a value of `1`
+  if not specified. The maximum value for the property that Azure Batch will
+  accept is `4 x <# vCPUs per compute node>` or `256`. For instance, for a
+  `STANDARD_D2_V3` instance, because the virtual machine has 2 vCPUs, the
+  maximum allowable value for this property would be `8`.
 * (optional) `node_fill_type` is the task scheduling compute node fill type
   policy to apply. `pack`, which is the default, attempts to pack the
-  maximum number of tasks on a node (controlled through `max_tasks_per_node`
+  maximum number of tasks on a node (controlled through `task_slots_per_node`
   before scheduling tasks to another node). `spread` will schedule tasks
   evenly across compute nodes before packing.
 * (optional) `autoscale` designates the autoscale settings for the pool. If
@@ -356,12 +356,13 @@ The default, if not specified, is `false`.
 * (optional) `reboot_on_start_task_failed` allows Batch Shipyard to reboot the
   compute node in case there is a transient failure in node preparation (e.g.,
   network timeout, resolution failure or download problem). This defaults to
+  `false`. This option is ignored for `auto_pool` where the behavior is always
   `false`.
 * (optional) `attempt_recovery_on_unusable` allows Batch Shipyard to attempt
   to recover nodes that enter `unusable` state automatically. Note that
   enabling this option can lead to infinite wait on `pool add` or `pool resize`
   with `--wait`. This defaults to `false` and is ignored for `custom_image`
-  where the behavior is always `false`.
+  and `auto_pool` where the behavior is always `false`.
 * (optional) `upload_diagnostics_logs_on_unusable` allows Batch Shipyard
   to attempt upload of diagnostics logs for nodes that have entered unusable
   state during provisioning to the storage account designated under the
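The ceiling described for `task_slots_per_node` in the updated documentation (4 x vCPUs, with an absolute cap of 256) works out as in the following sketch; the vCPU counts are illustrative:

# Worked example of the documented ceiling: 4 x vCPUs, capped at 256.
def max_task_slots(vcpus_per_node):
    return min(4 * vcpus_per_node, 256)

assert max_task_slots(2) == 8     # e.g. STANDARD_D2_V3 with 2 vCPUs
assert max_task_slots(96) == 256  # very large SKUs hit the 256 cap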

docs/14-batch-shipyard-configuration-jobs.md

Lines changed: 4 additions & 4 deletions
@@ -422,15 +422,15 @@ a set interval.
   nodes provisioned.
 * (optional) `run_exclusive` forces the job manager to run on a compute
   node where there are no other tasks running. The default is `false`.
-  This is only relevant when the pool's `max_tasks_per_node` setting is
-  greater than 1.
+  This is only relevant when the pool's `task_slots_per_node` setting
+  is greater than 1.
 * (optional) `monitor_task_completion` allows the job manager to
   monitor the tasks in the job for completion instead of relying on
   `auto_complete`. The advantage for doing so is that the job can move
   much more quickly into completed state thus allowing the next job
   recurrence to be created for very small values of
   `recurrence_interval`. In order to properly utilize this feature,
-  you must either set your pool's `max_tasks_per_node` to greater
+  you must either set your pool's `task_slots_per_node` to greater
   than 1 or have more than one compute node in your pool. If neither
   of these conditions are met, then the tasks that the job manager
   creates will be blocked as there will be no free scheduling slots
@@ -752,7 +752,7 @@ information and terminology definitions.
 * (optional) `exclusive` specifies if each task within the task group
   must not be co-scheduled with other running tasks on compute nodes.
   Effectively this excludes pools as scheduling targets that have
-  been provisioned with the setting `max_tasks_per_node` greater
+  been provisioned with the setting `task_slots_per_node` greater
   than `1`.
 * (optional) `gpu` specifies if tasks within the task group should
   be scheduled on a compute node that has a GPU. Note that specifying
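The scheduling-slot requirement described for `run_exclusive` and `monitor_task_completion` mirrors the guard added in `convoy/batch.py` above: the job manager needs a free slot alongside the tasks it creates. A small sketch of that check (a simplification, not the actual implementation):

# Simplified sketch of the scheduling-slot check described above; mirrors the
# total_slots guard in add_jobs but is not the actual implementation.
def job_manager_has_free_slot(task_slots_per_node, dedicated_nodes,
                              low_priority_nodes, allow_low_priority=True):
    total_vms = dedicated_nodes + (
        low_priority_nodes if allow_low_priority else 0)
    total_slots = task_slots_per_node * total_vms
    # with a single slot, the job manager itself occupies the only slot
    return total_slots > 1


assert job_manager_has_free_slot(1, 1, 0) is False
assert job_manager_has_free_slot(2, 1, 0) is True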

docs/68-batch-shipyard-federation.md

Lines changed: 1 addition & 1 deletion
@@ -276,7 +276,7 @@ non-native and use the same setting consistently across all pools and all job
 * `arm_image_id` under `vm_configuration`:`custom_image` will allow
   routing of task groups with `custom_image_arm_id` constraints.
 * `vm_size` will be impacted by `compute_node` job constraints.
-* `max_tasks_per_node` will impact available scheduling slots and the
+* `task_slots_per_node` will impact available scheduling slots and the
   `compute_node`:`exclusive` constraint.
 * `autoscale` changes behavior of scheduling across various constraints.
 * `inter_node_communication` enabled pools will allow tasks that contain

docs/96-troubleshooting-guide.md

Lines changed: 2 additions & 2 deletions
@@ -188,7 +188,7 @@ The `pool images update` command runs as a normal job if your pool is
 comprised entirely of dedicated compute nodes. Thus, your compute
 nodes must be able to accommodate this update job and task. If your pool only
 has one node in it, it will run as a single task under a job. If the node in
-this pool is busy and the `max_tasks_per_node` in your `pool.yaml` is either
+this pool is busy and the `task_slots_per_node` in your `pool.yaml` is either
 unspecified or set to 1, then it will be blocked behind the running task.

 For pools with more than 1 node, then the update images command will run
@@ -199,7 +199,7 @@ the `pool images update` command is issued. If before the task can be
 scheduled, the pool is resized down and the number of nodes decreases, then
 the update container images job will not be able to execute and will stay
 active until the number of compute nodes reaches the prior number.
-Additionally, if `max_tasks_per_node` is set to 1 or unspecified in
+Additionally, if `task_slots_per_node` is set to 1 or unspecified in
 `pool.yaml` and any task is running on any node, the update container images
 job will be blocked until that task completes.
