Skip to content

Commit 4f2b9f7

Browse files
author
pytorchbot
committed
2025-08-10 nightly release (6d56713)
1 parent a59a06e commit 4f2b9f7

37 files changed

+1425
-402
lines changed

backends/vulkan/op_registry.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,7 +435,19 @@ def register_2d_pool_op():
435435
)
436436
def register_convolution_op():
437437
return OpFeatures(
438-
inputs_storage=utils.CHANNELS_PACKED_TEXTURE,
438+
inputs_storage=[
439+
utils.CHANNELS_PACKED_TEXTURE, # input
440+
utils.NO_STORAGE, # weight (prepacked)
441+
utils.NO_STORAGE, # bias (prepacked)
442+
utils.NO_STORAGE, # stride (non tensor)
443+
utils.NO_STORAGE, # padding (non tensor)
444+
utils.NO_STORAGE, # dilation (non tensor)
445+
utils.NO_STORAGE, # transposed (non tensor)
446+
utils.NO_STORAGE, # output_padding (non tensor)
447+
utils.NO_STORAGE, # groups (non tensor)
448+
utils.NO_STORAGE, # output_min (non tensor)
449+
utils.NO_STORAGE, # output_max (non tensor)
450+
],
439451
supports_resize=True,
440452
supports_prepacking=True,
441453
)
@@ -491,17 +503,9 @@ def register_view_ops():
491503
# for both texture and buffer storage types.
492504
@update_features(exir_ops.edge.aten.cat.default)
493505
def register_cat_op():
494-
def check_cat_node(node: torch.fx.Node) -> bool:
495-
inputs = node.args[0]
496-
if isinstance(inputs, (list, tuple)) and len(inputs) <= 3:
497-
return True
498-
499-
return False
500-
501506
return OpFeatures(
502507
inputs_storage=utils.ANY_STORAGE,
503508
supports_resize=True,
504-
are_node_inputs_supported_fn=check_cat_node,
505509
)
506510

507511

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,7 @@ void vTensorStorage::transition(
517517
vkapi::MemoryAccessFlags prev_access = last_access_.access;
518518

519519
const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0;
520+
const bool cur_written = (cur_access & vkapi::MemoryAccessType::WRITE) != 0;
520521

521522
VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED;
522523
VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED;
@@ -528,7 +529,13 @@ void vTensorStorage::transition(
528529
layout_changed = cur_layout != new_layout;
529530
}
530531

531-
if (prev_written || layout_changed) {
532+
// RAW: need to make sure current read sees previous writes
533+
// WAW: need to make sure the current write occurs after previous write so
534+
// the final value is correct.
535+
// WAR: need to make sure previous read does not read the value from the
536+
// current write.
537+
// RAR: no need for synchronization
538+
if (prev_written || cur_written || layout_changed) {
532539
VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage);
533540
if (0u == src_stage) {
534541
src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,33 @@ void ComputeGraph::prepare_pipelines() {
799799
pipeline_descriptors_ = std::unordered_set<
800800
vkapi::ComputePipelineCache::Key,
801801
vkapi::ComputePipelineCache::Hasher>();
802+
803+
const size_t total_node_count = execute_nodes_.size();
804+
size_t init_threshold = config_.execute_initial_threshold_node_count;
805+
size_t count_threshold = config_.execute_threshold_node_count;
806+
807+
// If max command buffer count is set, we need to adjust the thresholds to
808+
// accommodate execution within the limit, if total command buffers with
809+
// current thresholds would exceed execute_max_cmds
810+
if (config_.execute_max_cmds > 0) {
811+
// Worst case scenario we have one command buffer for nodes before init
812+
// threshold and config_.execute_max_cmds - 1 command buffers for the rest
813+
// of dispatches
814+
815+
// If command buffers created after offsetting init_threshold would exceed
816+
// max command buffer count, we need to adjust init and count thresholds
817+
const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) >
818+
count_threshold * (config_.execute_max_cmds - 1);
819+
if (total_node_count > init_threshold && slicing_exceeds_max_cmds) {
820+
// Increase count threshold so remaining nodes after offsetting init fits
821+
// in config_.execute_max_cmds - 1
822+
count_threshold = static_cast<size_t>(ceil(
823+
(total_node_count - init_threshold) /
824+
double(config_.execute_max_cmds - 1)));
825+
}
826+
}
827+
828+
execute_threshold_node_count_ = count_threshold;
802829
}
803830

804831
void ComputeGraph::submit_current_cmd(const bool final_use) {
@@ -888,6 +915,7 @@ void ComputeGraph::execute() {
888915
context_->set_cmd(/*reusable = */ true);
889916

890917
context_->cmd_reset_querypool();
918+
const size_t total_node_count = execute_nodes_.size();
891919
uint32_t encoded_node_count = 0;
892920

893921
for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
@@ -900,11 +928,13 @@ void ComputeGraph::execute() {
900928
const bool reached_threshold =
901929
encoded_node_count >= config_.execute_initial_threshold_node_count &&
902930
((encoded_node_count - config_.execute_initial_threshold_node_count) %
903-
config_.execute_threshold_node_count ==
931+
execute_threshold_node_count_ ==
904932
0);
905933

906934
// Create a new command buffer when threshold is reached
907-
if (reached_threshold) {
935+
// But avoid it if this is the last node, since last cmd buf is submitted
936+
// after the loop
937+
if (reached_threshold && encoded_node_count != total_node_count) {
908938
context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false);
909939
deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd()));
910940
context_->set_cmd(true);

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,14 @@ class ComputeGraph final {
207207
// current Context's command buffer is submitted now.
208208
size_t staging_nbytes_in_cmd_ = 0;
209209

210+
// Represents the number of nodes to encode before submitting a command buffer.
211+
// If command buffers created with config.execute_threshold_node_count exceeds
212+
// config.execute_max_cmds, then execute_threshold_node_count will be
213+
// increased to fit command buffers within the limit. Otherwise,
214+
// execute_threshold_node_count will be set to
215+
// config.execute_threshold_node_count.
216+
size_t execute_threshold_node_count_ = 0;
217+
210218
public:
211219
//
212220
// Accessors

backends/vulkan/runtime/graph/GraphConfig.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ struct GraphConfig final {
6161
// by taking more advantage of parallelism between the CPU and GPU.
6262
size_t execute_initial_threshold_node_count = 0;
6363

64+
// If this number is greater than 0 then, during execute create at most this
65+
// many command buffers.
66+
size_t execute_max_cmds = 0;
67+
6468
vkapi::Adapter* external_adapter;
6569

6670
// Generate a default graph config with pre-configured settings

backends/vulkan/runtime/graph/ops/ExecuteNode.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ExecuteNode {
4343
friend class ComputeGraph;
4444

4545
public:
46-
using ResizeFunction = const std::function<void(
46+
using ResizeFunction = std::function<void(
4747
ComputeGraph*,
4848
const std::vector<ArgGroup>&,
4949
const std::vector<ValueRef>&)>;

backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,21 @@ layout(std430) buffer;
2020

2121
#include "indexing_utils.h"
2222

23-
${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
23+
${layout_declare_tensor(B, "rw", "t_out", DTYPE, "buffer")}
2424

2525
$for i in range(NUM_INPUTS):
26-
${layout_declare_tensor(B, "r", "t_in" + str(i + 1), DTYPE, "buffer")}
26+
${layout_declare_tensor(B, "r", "t_inp" + str(i), DTYPE, "buffer")}
27+
28+
${layout_declare_tensor(B, "r", "t_concat_offset", "int", "buffer")}
2729

2830
${layout_declare_ubo(B, "int", "concat_dim")}
2931

3032
${layout_declare_ubo(B, "ivec4", "out_sizes")}
3133
${layout_declare_ubo(B, "ivec4", "out_strides")}
3234

3335
$for i in range(NUM_INPUTS):
34-
${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_sizes")}
35-
${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_strides")}
36+
${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_sizes")}
37+
${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_strides")}
3638

3739
${layout_declare_ubo(B, "int", "out_numel")}
3840

@@ -42,28 +44,53 @@ const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
4244

4345
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
4446

47+
#define NUM_INPUTS ${NUM_INPUTS}
48+
49+
#include "concat_utils.glslh"
50+
51+
/*
52+
* This shader template concatenates up to NUM_INPUTS input tensors to the
53+
* output tensor along the concat_dim. Elements from the input tensor will
54+
* be inserted along the output's concat_dim starting at concat_offset.
55+
*/
4556
void main() {
46-
const int out_bufi = ivec3(gl_GlobalInvocationID).x;
47-
if (out_bufi >= out_numel) {
57+
const int tid = ivec3(gl_GlobalInvocationID).x;
58+
59+
// The 1-3 input tensors are interpreted as one concatenated tensor ("volume")
60+
// along the concat_dim for the purposes of tensor indexing. Each thread is
61+
// responsible for reading one item from this volume and writing it to the
62+
// appropriate output location.
63+
ivec4 inp_volume_sizes = out_sizes;
64+
inp_volume_sizes[concat_dim] = total_concat_dim_numel();
65+
66+
// Account for 0 size input tensors
67+
if (any(lessThanEqual(inp_volume_sizes, ivec4(0)))) {
68+
return;
69+
}
70+
71+
ivec4 inp_volume_tidx = nchwi_to_tidx(tid, inp_volume_sizes);
72+
73+
// bounds check
74+
if (any(greaterThanEqual(inp_volume_tidx, inp_volume_sizes))) {
4875
return;
4976
}
5077

51-
// Convert buffer linear index to 4-D tensor index for output
52-
const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order);
78+
int concat_offset = t_concat_offset[0];
79+
80+
ivec4 out_tidx = inp_volume_tidx;
81+
out_tidx[concat_dim] += concat_offset;
5382

54-
// Determine which input tensor to read from
55-
ivec4 in_tidx = out_tidx;
83+
const uint out_bufi = tidx_to_bufi(out_tidx, out_strides);
5684

85+
// Go through the list of input tensors, and find which input this output
86+
// element should be read from.
5787
$for i in range(NUM_INPUTS):
58-
// Check if the index at the concat dim is within bounds of the input tensor
59-
// If so, read from that input tensor and write to output
60-
if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) {
61-
int in_bufi = tidx_to_bufi(in_tidx, in${i+1}_strides);
62-
t_out[out_bufi] = t_in${i+1}[in_bufi];
88+
if (inp_volume_tidx[concat_dim] < inp${i}_sizes[concat_dim]) {
89+
int inp_bufi = tidx_to_bufi(inp_volume_tidx, inp${i}_strides);
90+
t_out[out_bufi] = t_inp${i}[inp_bufi];
6391
return;
6492
}
65-
// otherwise, decrement the index at the concat dim
6693
else {
67-
in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim];
94+
inp_volume_tidx[concat_dim] -= inp${i}_sizes[concat_dim];
6895
}
6996
}

0 commit comments

Comments
 (0)