Skip to content

Commit 48bd9f8

Browse files
committed
Add ability to schedule splits based on Task load, not Node load.
1 parent 3829cb9 commit 48bd9f8

File tree

9 files changed

+257
-4
lines changed

9 files changed

+257
-4
lines changed

presto-docs/src/main/sphinx/admin/properties-session.rst

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,20 @@ Use this to optimize the ``map_filter()`` and ``map_subset()`` function.
434434

435435
It controls if subfields access is executed at the data source or not.
436436

437+
``schedule_splits_based_on_task_load``
438+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
439+
* **Type:** ``boolean``
440+
* **Default value:** ``false``
441+
442+
If ``true``, splits are scheduled to tasks based on task load rather than node load.
443+
This is particularly useful for the native worker, as it runs splits for tasks differently than the Java worker.
444+
When enabled, the per-task split limit is set by the configuration property :ref:`admin/properties:\`\`node-scheduler.max-splits-per-task\`\``.
445+
446+
Set to ``true`` to use as shown in this example:
447+
448+
``SET SESSION schedule_splits_based_on_task_load=true;``
449+
450+
437451
JDBC Properties
438452
---------------
439453

presto-docs/src/main/sphinx/admin/properties.rst

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,30 @@ due to splits not being balanced across workers. Ideally, it should be set
725725
such that there is always at least one split waiting to be processed, but
726726
not higher.
727727

728+
``node-scheduler.max-splits-per-task``
729+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
730+
731+
* **Type:** ``integer``
732+
* **Default value:** ``10``
733+
734+
The target value for the number of splits that can be running for
735+
each task, assuming all splits have the standard split weight.
736+
737+
Using a higher value is recommended if task parallelism is higher than 10.
738+
Increasing this value may improve query latency by ensuring that the workers
739+
have enough splits to keep them fully utilized.
740+
741+
When connectors support weight-based split scheduling, the number of splits
742+
assigned will depend on the weight of the individual splits. If splits are
743+
small, more of them are allowed to be assigned to each task to compensate.
744+
745+
Setting this too high will waste memory and may result in lower performance
746+
due to splits not being balanced across workers. Ideally, it should be set
747+
such that there is always at least one split waiting to be processed, but
748+
not higher.
749+
750+
This limit is used only when splits are scheduled based on task load, which can be enabled with the session property :ref:`admin/properties-session:\`\`schedule_splits_based_on_task_load\`\``.
751+
728752
``node-scheduler.max-pending-splits-per-task``
729753
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
730754

presto-main-base/src/main/java/com/facebook/presto/SystemSessionProperties.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ public final class SystemSessionProperties
244244
public static final String AGGREGATION_IF_TO_FILTER_REWRITE_STRATEGY = "aggregation_if_to_filter_rewrite_strategy";
245245
public static final String JOINS_NOT_NULL_INFERENCE_STRATEGY = "joins_not_null_inference_strategy";
246246
public static final String RESOURCE_AWARE_SCHEDULING_STRATEGY = "resource_aware_scheduling_strategy";
247+
public static final String SCHEDULE_SPLITS_BASED_ON_TASK_LOAD = "schedule_splits_based_on_task_load";
247248
public static final String HEAP_DUMP_ON_EXCEEDED_MEMORY_LIMIT_ENABLED = "heap_dump_on_exceeded_memory_limit_enabled";
248249
public static final String EXCEEDED_MEMORY_LIMIT_HEAP_DUMP_FILE_DIRECTORY = "exceeded_memory_limit_heap_dump_file_directory";
249250
public static final String DISTRIBUTED_TRACING_MODE = "distributed_tracing_mode";
@@ -1422,6 +1423,11 @@ public SystemSessionProperties(
14221423
false,
14231424
value -> ResourceAwareSchedulingStrategy.valueOf(((String) value).toUpperCase()),
14241425
ResourceAwareSchedulingStrategy::name),
1426+
booleanProperty(
1427+
SCHEDULE_SPLITS_BASED_ON_TASK_LOAD,
1428+
"Schedule splits based on task load, rather than on the node load.",
1429+
nodeSchedulerConfig.isScheduleSplitsBasedOnTaskLoad(),
1430+
false),
14251431
stringProperty(
14261432
ANALYZER_TYPE,
14271433
"Analyzer type to use.",
@@ -2917,6 +2923,11 @@ public static ResourceAwareSchedulingStrategy getResourceAwareSchedulingStrategy
29172923
return session.getSystemProperty(RESOURCE_AWARE_SCHEDULING_STRATEGY, ResourceAwareSchedulingStrategy.class);
29182924
}
29192925

2926+
/**
 * Reads the {@code schedule_splits_based_on_task_load} system session property.
 *
 * @param session the session whose system properties are consulted
 * @return the property value, as returned by {@code Session.getSystemProperty}
 */
// NOTE(review): the declared return type is the boxed Boolean; NodeScheduler passes the
// result to a primitive boolean constructor parameter, so it is auto-unboxed there.
public static Boolean isScheduleSplitsBasedOnTaskLoad(Session session)
2927+
{
2928+
return session.getSystemProperty(SCHEDULE_SPLITS_BASED_ON_TASK_LOAD, Boolean.class);
2929+
}
2930+
29202931
public static String getAnalyzerType(Session session)
29212932
{
29222933
return session.getSystemProperty(ANALYZER_TYPE, String.class);

presto-main-base/src/main/java/com/facebook/presto/execution/scheduler/NodeAssignmentStats.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ public long getQueuedSplitsWeightForStage(InternalNode node)
7070
return stageInfo == null ? 0 : stageInfo.getQueuedSplitsWeight();
7171
}
7272

73+
/**
 * Returns the assigned splits weight recorded for the given node in this stage.
 *
 * @param node the node to look up; its node identifier is the lookup key
 * @return the value of {@code PendingSplitInfo.getAssignedSplitsWeight()} for the node,
 *         or 0 when no entry exists for the node
 */
public long getAssignedSplitsWeightForStage(InternalNode node)
74+
{
75+
// Entries are keyed by node identifier; a missing entry means nothing is tracked for this node.
PendingSplitInfo stageInfo = stageQueuedSplitInfo.get(node.getNodeIdentifier());
76+
return stageInfo == null ? 0 : stageInfo.getAssignedSplitsWeight();
77+
}
78+
7379
public int getUnacknowledgedSplitCountForStage(InternalNode node)
7480
{
7581
PendingSplitInfo stageInfo = stageQueuedSplitInfo.get(node.getNodeIdentifier());

presto-main-base/src/main/java/com/facebook/presto/execution/scheduler/NodeScheduler.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
import static com.facebook.airlift.concurrent.MoreFutures.whenAnyCompleteCancelOthers;
6262
import static com.facebook.presto.SystemSessionProperties.getMaxUnacknowledgedSplitsPerTask;
6363
import static com.facebook.presto.SystemSessionProperties.getResourceAwareSchedulingStrategy;
64+
import static com.facebook.presto.SystemSessionProperties.isScheduleSplitsBasedOnTaskLoad;
6465
import static com.facebook.presto.execution.scheduler.NodeSchedulerConfig.NetworkTopologyType;
6566
import static com.facebook.presto.execution.scheduler.NodeSchedulerConfig.ResourceAwareSchedulingStrategy;
6667
import static com.facebook.presto.execution.scheduler.NodeSchedulerConfig.ResourceAwareSchedulingStrategy.TTL;
@@ -91,6 +92,7 @@ public class NodeScheduler
9192
private final int minCandidates;
9293
private final boolean includeCoordinator;
9394
private final long maxSplitsWeightPerNode;
95+
private final long maxSplitsWeightPerTask;
9496
private final long maxPendingSplitsWeightPerTask;
9597
private final NodeTaskMap nodeTaskMap;
9698
private final boolean useNetworkTopology;
@@ -146,6 +148,7 @@ public NodeScheduler(
146148
int maxPendingSplitsPerTask = config.getMaxPendingSplitsPerTask();
147149
checkArgument(maxSplitsPerNode >= maxPendingSplitsPerTask, "maxSplitsPerNode must be > maxPendingSplitsPerTask");
148150
this.maxSplitsWeightPerNode = SplitWeight.rawValueForStandardSplitCount(maxSplitsPerNode);
151+
this.maxSplitsWeightPerTask = SplitWeight.rawValueForStandardSplitCount(config.getMaxSplitsPerTask());
149152
this.maxPendingSplitsWeightPerTask = SplitWeight.rawValueForStandardSplitCount(maxPendingSplitsPerTask);
150153
this.nodeTaskMap = requireNonNull(nodeTaskMap, "nodeTaskMap is null");
151154
this.useNetworkTopology = !config.getNetworkTopology().equals(NetworkTopologyType.LEGACY);
@@ -231,9 +234,11 @@ public NodeSelector createNodeSelector(Session session, ConnectorId connectorId,
231234
nodeSelectionStats,
232235
nodeTaskMap,
233236
includeCoordinator,
237+
isScheduleSplitsBasedOnTaskLoad(session),
234238
nodeMap,
235239
minCandidates,
236240
maxSplitsWeightPerNode,
241+
maxSplitsWeightPerTask,
237242
maxPendingSplitsWeightPerTask,
238243
maxUnacknowledgedSplitsPerTask,
239244
maxTasksPerStage,

presto-main-base/src/main/java/com/facebook/presto/execution/scheduler/NodeSchedulerConfig.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ public static class NetworkTopologyType
3333
private int minCandidates = 10;
3434
private boolean includeCoordinator = true;
3535
private int maxSplitsPerNode = 100;
36+
private int maxSplitsPerTask = 10;
37+
private boolean scheduleSplitsBasedOnTaskLoad;
3638
private int maxPendingSplitsPerTask = 10;
3739
private int maxUnacknowledgedSplitsPerTask = 500;
3840
private String networkTopology = NetworkTopologyType.LEGACY;
@@ -106,6 +108,33 @@ public NodeSchedulerConfig setMaxSplitsPerNode(int maxSplitsPerNode)
106108
return this;
107109
}
108110

111+
/**
 * Returns the configured {@code node-scheduler.max-splits-per-task} value (field default: 10).
 * NodeScheduler converts this count to a raw split weight via
 * {@code SplitWeight.rawValueForStandardSplitCount}.
 */
// NOTE(review): no lower-bound constraint is declared on this getter, whereas
// getMaxUnacknowledgedSplitsPerTask() carries @Min(1) — confirm whether values < 1 should be rejected.
public int getMaxSplitsPerTask()
112+
{
113+
return maxSplitsPerTask;
114+
}
115+
116+
/**
 * Sets the per-task split limit bound to {@code node-scheduler.max-splits-per-task}.
 *
 * @param maxSplitsPerTask number of standard-weight splits allowed per task
 * @return this config instance, to allow chained setter calls
 */
@Config("node-scheduler.max-splits-per-task")
117+
@ConfigDescription("The number of splits weighted at the standard split weight that are allowed to be scheduled for each task " +
118+
"when scheduling splits based on the task load.")
119+
public NodeSchedulerConfig setMaxSplitsPerTask(int maxSplitsPerTask)
120+
{
121+
this.maxSplitsPerTask = maxSplitsPerTask;
122+
return this;
123+
}
124+
125+
/**
 * Returns whether splits are scheduled based on task load rather than node load.
 * The backing field has no initializer, so it is {@code false} unless set.
 * SystemSessionProperties uses this value as the default of the
 * {@code schedule_splits_based_on_task_load} session property.
 */
public boolean isScheduleSplitsBasedOnTaskLoad()
126+
{
127+
return scheduleSplitsBasedOnTaskLoad;
128+
}
129+
130+
/**
 * Sets the flag bound to {@code node-scheduler.schedule-splits-based-on-task-load}.
 *
 * @param scheduleSplitsBasedOnTaskLoad {@code true} to schedule splits based on task load
 * @return this config instance, to allow chained setter calls
 */
@Config("node-scheduler.schedule-splits-based-on-task-load")
131+
@ConfigDescription("Schedule splits based on task load, rather than on the node load")
132+
public NodeSchedulerConfig setScheduleSplitsBasedOnTaskLoad(boolean scheduleSplitsBasedOnTaskLoad)
133+
{
134+
this.scheduleSplitsBasedOnTaskLoad = scheduleSplitsBasedOnTaskLoad;
135+
return this;
136+
}
137+
109138
@Min(1)
110139
public int getMaxUnacknowledgedSplitsPerTask()
111140
{

presto-main-base/src/main/java/com/facebook/presto/execution/scheduler/nodeSelection/SimpleNodeSelector.java

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import com.facebook.airlift.log.Logger;
1717
import com.facebook.presto.execution.NodeTaskMap;
1818
import com.facebook.presto.execution.RemoteTask;
19+
import com.facebook.presto.execution.TaskStatus;
1920
import com.facebook.presto.execution.scheduler.BucketNodeMap;
2021
import com.facebook.presto.execution.scheduler.InternalNodeInfo;
2122
import com.facebook.presto.execution.scheduler.NodeAssignmentStats;
@@ -35,8 +36,10 @@
3536
import com.google.common.collect.Multimap;
3637
import com.google.common.util.concurrent.ListenableFuture;
3738

39+
import java.util.HashMap;
3840
import java.util.HashSet;
3941
import java.util.List;
42+
import java.util.Map;
4043
import java.util.Objects;
4144
import java.util.Optional;
4245
import java.util.OptionalInt;
@@ -71,9 +74,11 @@ public class SimpleNodeSelector
7174
private final NodeSelectionStats nodeSelectionStats;
7275
private final NodeTaskMap nodeTaskMap;
7376
private final boolean includeCoordinator;
77+
private final boolean scheduleSplitsBasedOnTaskLoad;
7478
private final AtomicReference<Supplier<NodeMap>> nodeMap;
7579
private final int minCandidates;
7680
private final long maxSplitsWeightPerNode;
81+
private final long maxSplitsWeightPerTask;
7782
private final long maxPendingSplitsWeightPerTask;
7883
private final int maxUnacknowledgedSplitsPerTask;
7984
private final int maxTasksPerStage;
@@ -84,9 +89,11 @@ public SimpleNodeSelector(
8489
NodeSelectionStats nodeSelectionStats,
8590
NodeTaskMap nodeTaskMap,
8691
boolean includeCoordinator,
92+
boolean scheduleSplitsBasedOnTaskLoad,
8793
Supplier<NodeMap> nodeMap,
8894
int minCandidates,
8995
long maxSplitsWeightPerNode,
96+
long maxSplitsWeightPerTask,
9097
long maxPendingSplitsWeightPerTask,
9198
int maxUnacknowledgedSplitsPerTask,
9299
int maxTasksPerStage,
@@ -96,9 +103,11 @@ public SimpleNodeSelector(
96103
this.nodeSelectionStats = requireNonNull(nodeSelectionStats, "nodeSelectionStats is null");
97104
this.nodeTaskMap = requireNonNull(nodeTaskMap, "nodeTaskMap is null");
98105
this.includeCoordinator = includeCoordinator;
106+
this.scheduleSplitsBasedOnTaskLoad = scheduleSplitsBasedOnTaskLoad;
99107
this.nodeMap = new AtomicReference<>(nodeMap);
100108
this.minCandidates = minCandidates;
101109
this.maxSplitsWeightPerNode = maxSplitsWeightPerNode;
110+
this.maxSplitsWeightPerTask = maxSplitsWeightPerTask;
102111
this.maxPendingSplitsWeightPerTask = maxPendingSplitsWeightPerTask;
103112
this.maxUnacknowledgedSplitsPerTask = maxUnacknowledgedSplitsPerTask;
104113
checkArgument(maxUnacknowledgedSplitsPerTask > 0, "maxUnacknowledgedSplitsPerTask must be > 0, found: %s", maxUnacknowledgedSplitsPerTask);
@@ -149,6 +158,11 @@ public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTas
149158
Set<InternalNode> blockedExactNodes = new HashSet<>();
150159
boolean splitWaitingForAnyNode = false;
151160

161+
Optional<ToLongFunction<InternalNode>> taskLoadSplitWeightProvider = Optional.empty();
162+
if (this.scheduleSplitsBasedOnTaskLoad) {
163+
taskLoadSplitWeightProvider = Optional.of(createTaskLoadSplitWeightProvider(existingTasks, assignmentStats));
164+
}
165+
152166
NodeProvider nodeProvider = nodeMap.getNodeProvider(maxPreferredNodes);
153167
OptionalInt preferredNodeCount = OptionalInt.empty();
154168
for (Split split : splits) {
@@ -179,9 +193,16 @@ public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTas
179193
}
180194

181195
SplitWeight splitWeight = split.getSplitWeight();
182-
Optional<InternalNodeInfo> chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getTotalSplitsWeight, preferredNodeCount, maxSplitsWeightPerNode, assignmentStats);
183-
if (!chosenNodeInfo.isPresent()) {
184-
chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getQueuedSplitsWeightForStage, preferredNodeCount, maxPendingSplitsWeightPerTask, assignmentStats);
196+
Optional<InternalNodeInfo> chosenNodeInfo = Optional.empty();
197+
198+
if (taskLoadSplitWeightProvider.isPresent()) {
199+
chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, taskLoadSplitWeightProvider.get(), preferredNodeCount, maxSplitsWeightPerTask, assignmentStats);
200+
}
201+
else {
202+
chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getTotalSplitsWeight, preferredNodeCount, maxSplitsWeightPerNode, assignmentStats);
203+
if (!chosenNodeInfo.isPresent()) {
204+
chosenNodeInfo = chooseLeastBusyNode(splitWeight, candidateNodes, assignmentStats::getQueuedSplitsWeightForStage, preferredNodeCount, maxPendingSplitsWeightPerTask, assignmentStats);
205+
}
185206
}
186207

187208
if (chosenNodeInfo.isPresent()) {
@@ -223,6 +244,28 @@ public SplitPlacementResult computeAssignments(Set<Split> splits, List<RemoteTas
223244
return selectDistributionNodes(nodeMap.get().get(), nodeTaskMap, maxSplitsWeightPerNode, maxPendingSplitsWeightPerTask, maxUnacknowledgedSplitsPerTask, splits, existingTasks, bucketNodeMap, nodeSelectionStats);
224245
}
225246

247+
/**
 * Builds a function that reports the split-weight load of a node from the point of view of
 * the given tasks, rather than from the node's total load.
 *
 * For a node that has a task in {@code existingTasks}, the load is the task's queued plus
 * running partitioned splits weight (read from its {@code TaskStatus} each time the function
 * is invoked) plus the assigned splits weight tracked for the stage in {@code assignmentStats}.
 * For a node without such a task, the load is only the assigned splits weight for the stage.
 *
 * @param existingTasks tasks to index by node id
 * @param assignmentStats source of the per-node assigned splits weight for the stage
 * @return a function mapping a node to its task-level split weight
 */
private ToLongFunction<InternalNode> createTaskLoadSplitWeightProvider(List<RemoteTask> existingTasks, NodeAssignmentStats assignmentStats)
248+
{
249+
// Index the tasks by node id once so each per-node lookup is a map access.
// If existingTasks contains several tasks with the same node id, the last one wins.
250+
Map<String, RemoteTask> tasksByNodeId = new HashMap<>();
251+
for (RemoteTask task : existingTasks) {
252+
tasksByNodeId.put(task.getNodeId(), task);
253+
}
254+
255+
return node -> {
256+
RemoteTask remoteTask = tasksByNodeId.get(node.getNodeIdentifier());
257+
if (remoteTask == null) {
258+
// No task for this node, return only the assigned splits weight tracked for the stage
259+
return assignmentStats.getAssignedSplitsWeightForStage(node);
260+
}
261+
262+
// Task status is fetched on every invocation, not cached when the provider is created.
TaskStatus taskStatus = remoteTask.getTaskStatus();
263+
return taskStatus.getQueuedPartitionedSplitsWeight() +
264+
taskStatus.getRunningPartitionedSplitsWeight() +
265+
assignmentStats.getAssignedSplitsWeightForStage(node);
266+
};
267+
}
268+
226269
protected Optional<InternalNodeInfo> chooseLeastBusyNode(SplitWeight splitWeight, List<InternalNode> candidateNodes, ToLongFunction<InternalNode> splitWeightProvider, OptionalInt preferredNodeCount, long maxSplitsWeight, NodeAssignmentStats assignmentStats)
227270
{
228271
long minWeight = Long.MAX_VALUE;

0 commit comments

Comments
 (0)