使用TVM优化blur算子 #154

linyv · 2022-09-23T07:08:18Z

linyv
Sep 23, 2022

好奇tvm在一些图像算子上优化效果，我从blur算法入手做了些尝试，并与halide进行了对比，有几点疑问想要请教：

使用auto_scheduler

@auto_scheduler.register_workload
def blur(M, N, dtype):
    """Describe a 3x3 window-sum blur followed by a division, as TE tensors.

    The input is an (M, N) placeholder named "A". Stage "B" sums a 3x3
    window of "A" for each output element, and stage "C" divides that sum
    by the single element of the one-element placeholder "K". Both stages
    have shape (M - 8, N - 2).

    Returns the list [A, K, C] of tensors that form the workload signature.
    """
    out_shape = (M - 8, N - 2)

    # Reduction axes that walk the 3x3 window.
    win_rows = te.reduce_axis((0, 3), "k1")
    win_cols = te.reduce_axis((0, 3), "k2")

    image = te.placeholder((M, N), dtype=dtype, name="A")
    divisor = te.placeholder((1,), dtype=dtype, name="K")

    # The parameter names m/n are kept: TE derives the loop names from them.
    def window_sum(m, n):
        return te.sum(image[m + win_rows, n + win_cols], axis=(win_rows, win_cols))

    summed = te.compute(out_shape, window_sum, name="B")

    def scaled(m, n):
        return te.div(summed[m, n], divisor[0])

    blurred = te.compute(out_shape, scaled, name="C")
    return [image, divisor, blurred]

# Input image extents passed to the `blur` workload.
M=2568
N=1922

# Compile through LLVM for a Skylake AVX-512 CPU.
target = tvm.target.Target("llvm -mcpu=skylake-avx512")
dtype="uint16"

# Search task for the registered `blur` workload with this shape, dtype and target.
task_x = auto_scheduler.SearchTask(
    func=blur, args=(M, N, dtype), target=target
)

# Measurement records are written to this file through the RecordToFile callback below.
log_file = "blur.json"
measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300, timeout=600)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=1000,  # search budget: number of candidate schedules to measure
    runner=measure_ctx.runner,
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    verbose=2,
)

# Run the search, then rebuild the best recorded schedule from the log file.
task_x.tune(tune_option)
sch_x, args_x = task_x.apply_best(log_file)
# The measurement context is no longer needed once tuning has finished.
del measure_ctx

import numpy as np
# NOTE(review): `target` is rebound here from a tvm.target.Target object to a plain string.
target = "llvm -mcpu=skylake-avx512"
dev = tvm.device(target, 0)

# All-ones (M, N) input, a one-element divisor buffer holding 9, and a zeroed output.
inp = tvm.nd.array(np.ones((M, N)).astype(dtype))
inp_k_np = np.ones((1)).astype(dtype)
inp_k_np[0] = 9
inp_k = tvm.nd.array(inp_k_np)
out = tvm.nd.array(np.zeros((M-8, N-2), dtype=dtype))

# Build the tuned schedule and report the mean time from the time evaluator.
func_x = tvm.build(sch_x, args_x, target)
evaluator = func_x.time_evaluator(func_x.entry_name, dev, number=10)
print("after transformation: %f" % evaluator(inp, inp_k, out).mean)
# Print the lowered IR of the selected schedule for inspection.
print(tvm.lower(sch_x, args_x, simple_mode=True))

搜索出来的调度性能和预期差距较大，看搜索结果似乎并没有tile，vectorize等优化逻辑，两个block也应该可以采用compute_at的方式融合在一块，但是也没有，不知是哪里配置有问题？

@main = primfn(A_1: handle, K_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(uint8), uint8, [4935696], []),
             K: Buffer(K_2: Pointer(uint8), uint8, [1], []),
             C: Buffer(C_2: Pointer(uint8), uint8, [4915200], [])}
  buffer_map = {A_1: A, K_1: K, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, uint8, [2568, 1922], []), K_1: K_3: Buffer(K_2, uint8, [1], []), C_1: C_3: Buffer(C_2, uint8, [2560, 1920], [])} {
  for (m: int32, 0, 2560) "parallel" {
    allocate(B.rf: Pointer(global uint8), uint8, [1920]), storage_scope = global;
    allocate(B: Pointer(global uint8), uint8, [1]), storage_scope = global {
      for (n: int32, 0, 1920) {
        let cse_var_1: int32 = ((m*1922) + n)
         {
          B.rf_1: Buffer(B.rf, uint8, [1920], [])[n] = 0u8
          B.rf_1[n] = (B.rf_1[n] + A[cse_var_1])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 2)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1922)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1923)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1924)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 3844)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 3845)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 3846)])
        }
      }
      for (n_1: int32, 0, 1920) {
        B_1: Buffer(B, uint8, [1], [], align=1)[0] = 0u8
        B_1[0] = (B_1[0] + B.rf_1[n_1])
        C[((m*1920) + n_1)] = (B_1[0] / K[0])
      }
    }
  }
}

手动调整schedule
自动搜索无果后，我尝试了多种方法来手动对schedule调整，第一个困难是这两个逻辑会创造B, C两个buffer，实际只需要C一个就可以了，如何才能共用呢？

B = te.compute((M-8, N-2), lambda m, n: te.sum(A[m+k1, n+k2], axis=(k1, k2)), name="B")
C = te.compute((M-8, N-2), lambda m, n: te.div(B[m, n], K[0]), name="C")

我先忽略了共用buffer的问题，先进行一些schedule调整：

# `ir_module` is not defined in this snippet; presumably it is the IRModule
# created from the blur compute definition — confirm against the full script.
sch = tvm.tir.Schedule(ir_module)
# Block "B" is the window-sum stage; its four loops are unpacked as two
# output loops followed by the two window loops.
block_b = sch.get_block("B")
bx, by, bk1, bk2 = sch.get_loops(block_b)

# Split B's output loops with inner extents 32 and 16.
# NOTE(review): in the module printed after this script, B sits under C's
# tile loops with a single new 16-wide loop, so these two splits appear to
# be superseded by the compute_at call below — confirm whether they are needed.
bxo, bxi = sch.split(bx, (None, 32))
byo, byi = sch.split(by, (None, 16))

# Block "C" is the division stage; tile its loops the same way (32 x 16).
block_c = sch.get_block("C")
cx, cy = sch.get_loops(block_c)
cxo, cxi = sch.split(cx, (None, 32))
cyo, cyi = sch.split(cy, (None, 16))

# Nest B under C's outer column loop.
sch.compute_at(block_b, cyo)

# Fetch B's loops again, since compute_at changed the loop nest around it.
block_b = sch.get_block("B")
xo, xi, yo, yi, bk1, bk2 = sch.get_loops(block_b)
# Unroll the 3x3 window, parallelize the outermost loop, and vectorize the
# innermost output loop.
sch.unroll(bk1)
sch.unroll(bk2)
sch.parallel(xo)
sch.vectorize(yi)

# Show the scheduled module as TVMScript.
print(sch.mod.script())

经过上述调整，我们可以得到如下IRModule：

# from tvm.script import tir as T
# Scheduled blur module as printed by the script: the window-sum block "B"
# writes a separate buffer B, which the division block "C" then reads.
@tvm.script.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer[(2568, 1922), "uint16"], K: T.Buffer[1, "uint16"], C: T.Buffer[(2560, 1920), "uint16"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        # with T.block("root")
        # Intermediate buffer for the window sums, the same shape as the output C.
        B = T.alloc_buffer([2560, 1920], dtype="uint16")
        # 80 parallel row tiles of 32 rows; each row is walked in 120 column tiles of width 16.
        for i0_0 in T.parallel(80):
            for i0_1, i1_0 in T.grid(32, 120):
                # Window sum for one 16-wide tile: vectorized over columns, 3x3 window unrolled.
                for ax0 in T.vectorized(16):
                    for ax1 in T.unroll(3):
                        for ax2 in T.unroll(3):
                            with T.block("B"):
                                m = T.axis.spatial(2560, i0_0 * 32 + i0_1)
                                n = T.axis.spatial(1920, i1_0 * 16 + ax0)
                                k1, k2 = T.axis.remap("RR", [ax1, ax2])
                                T.reads(A[m + k1, n + k2])
                                T.writes(B[m, n])
                                with T.init():
                                    B[m, n] = T.uint16(0)
                                B[m, n] = B[m, n] + A[m + k1, n + k2]
                # Division for the same tile; this loop is serial, not vectorized.
                for i1_1 in T.serial(16):
                    with T.block("C"):
                        m = T.axis.spatial(2560, i0_0 * 32 + i0_1)
                        n = T.axis.spatial(1920, i1_0 * 16 + i1_1)
                        T.reads(B[m, n], K[0])
                        T.writes(C[m, n])
                        C[m, n] = B[m, n] / K[0]

于是我手动改写了该Module，将B buffer换成直接复用C buffer，有一些性能收益，但是不太多，是这里本身就会有优化吗？目前与halide仍差距较大，分析了一下，发现T.block("C")中的除法是性能的瓶颈，请问有什么改进建议吗？

from tvm.script import tir as T
# Hand-edited variant of the scheduled blur module: there is no separate B
# buffer; the window sum is accumulated directly in the output buffer C and
# then divided in place.
@tvm.script.ir_module
class BlurModule:
    @T.prim_func
    def main(A: T.Buffer[(2568, 1922), "uint16"], K: T.Buffer[1, "uint16"], C: T.Buffer[(2560, 1920), "uint16"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        # with T.block("root")
        # 80 parallel row tiles of 32 rows; each row is walked in 120 column tiles of width 16.
        for i0_0 in T.parallel(80):
            for i0_1, i1_0 in T.grid(32, 120):
                # Window sum for one 16-wide tile, accumulated into C itself.
                for ax0 in T.vectorized(16):
                    for ax1 in T.unroll(3):
                        for ax2 in T.unroll(3):
                            with T.block("B"):
                                m = T.axis.spatial(2560, i0_0 * 32 + i0_1)
                                n = T.axis.spatial(1920, i1_0 * 16 + ax0)
                                k1, k2 = T.axis.remap("RR", [ax1, ax2])
                                T.reads(A[m + k1, n + k2])
                                T.writes(C[m, n])
                                with T.init():
                                    C[m, n] = T.uint16(0)
                                C[m, n] = C[m, n] + A[m + k1, n + k2]
                # In-place division of the same tile by K[0]; this loop is vectorized here.
                for i1_1 in T.vectorized(16):
                    with T.block("C"):
                        m = T.axis.spatial(2560, i0_0 * 32 + i0_1)
                        n = T.axis.spatial(1920, i1_0 * 16 + i1_1)
                        T.reads(C[m, n], K[0])
                        T.writes(C[m, n])
                        C[m, n] = C[m, n] / K[0]

linyv · 2022-09-23T07:09:05Z

linyv
Sep 23, 2022
Author

类似的图像算子优化是tvm的设计目标吗？

0 replies

linyv · 2022-09-23T08:43:03Z

linyv
Sep 23, 2022
Author

现在我发现了一个优化点，由于之前在写compute时有一个除法操作，我的输入是uint16（主要是和halide的实现保持一致，对比公平），而compute中无法指定一个uint16的数字（现在我知道了可以用tvm.tir.const），所以采用的方式是传入一个K buffer，让K保存除数来完成compute的编写，而得到的IRModule中，其实可以直接将K Buffer用T.uint16(9)来代替，此操作会带来性能的大幅提高，超过了halide中手写的simd_blur，但离halide的生成代码还有差距，三者的耗时表现大约是tvm_blur->0.0006s，simd_blur->0.0008s，halide_blur->0.0005s。

# Hand-edited blur module with a constant divisor: the K buffer is gone and
# the division uses T.uint16(9) directly. The window sum is accumulated in
# the output buffer C and then divided in place.
@tvm.script.ir_module
class Module:
    @T.prim_func
    def main(A: T.Buffer[(2568, 1922), "uint16"], C: T.Buffer[(2560, 1920), "uint16"]) -> None:
        # function attr dict
        T.func_attr({"global_symbol": "main", "tir.noalias": True})
        # body
        # with T.block("root")
        # 80 parallel row tiles of 32 rows; each row is walked in 120 column tiles of width 16.
        for i0_0 in T.parallel(80):
            for i0_1, i1_0 in T.grid(32, 120):
                # Window sum for one 16-wide tile, accumulated into C itself.
                for ax0 in T.vectorized(16):
                    for ax1 in T.unroll(3):
                        for ax2 in T.unroll(3):
                            # Named "B" (not "C") so the two blocks have distinct
                            # names and can be looked up unambiguously by name.
                            with T.block("B"):
                                m = T.axis.spatial(2560, i0_0 * 32 + i0_1)
                                n = T.axis.spatial(1920, i1_0 * 16 + ax0)
                                k1, k2 = T.axis.remap("RR", [ax1, ax2])
                                T.reads(A[m + k1, n + k2])
                                T.writes(C[m, n])
                                with T.init():
                                    C[m, n] = T.uint16(0)
                                C[m, n] = C[m, n] + A[m + k1, n + k2]
                # In-place division of the same tile by the constant 9.
                for i1_1 in T.vectorized(16):
                    with T.block("C"):
                        m = T.axis.spatial(2560, i0_0 * 32 + i0_1)
                        n = T.axis.spatial(1920, i1_0 * 16 + i1_1)
                        T.reads(C[m, n])
                        T.writes(C[m, n])
                        C[m, n] = C[m, n] / T.uint16(9)

1 reply

yzh119 Oct 2, 2022
Maintainer

init block可以通过decompose_init操作提出来，避免产生多余的branch。

linyv · 2022-09-23T12:32:54Z

linyv
Sep 23, 2022
Author

将blur算子描述改成如下的样子，再次搜索，性能约0.0007s

@auto_scheduler.register_workload
def blur(M, N, dtype):
    """Describe a 3x3 window-sum blur divided by the constant 9, as TE tensors.

    The input is an (M, N) placeholder named "A". Stage "B" sums a 3x3
    window of "A" for each output element, and stage "C" divides that sum
    by a constant 9 of the same dtype. Both stages have shape (M - 8, N - 2).

    Returns the list [A, C] of tensors that form the workload signature.
    """
    out_shape = (M - 8, N - 2)

    # Reduction axes that walk the 3x3 window.
    win_rows = te.reduce_axis((0, 3), "k1")
    win_cols = te.reduce_axis((0, 3), "k2")

    image = te.placeholder((M, N), dtype=dtype, name="A")

    # The parameter names m/n are kept: TE derives the loop names from them.
    def window_sum(m, n):
        return te.sum(image[m + win_rows, n + win_cols], axis=(win_rows, win_cols))

    summed = te.compute(out_shape, window_sum, name="B")

    def scaled(m, n):
        return te.div(summed[m, n], tvm.tir.const(9, dtype=dtype))

    blurred = te.compute(out_shape, scaled, name="C")
    return [image, blurred]

搜索得到的策略为：

@main = primfn(A_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(uint16), uint16, [4935696], []),
             C: Buffer(C_2: Pointer(uint16), uint16, [4915200], [])}
  buffer_map = {A_1: A, C_1: C}
  preflattened_buffer_map = {A_1: A_3: Buffer(A_2, uint16, [2568, 1922], []), C_1: C_3: Buffer(C_2, uint16, [2560, 1920], [])} {
  for (m: int32, 0, 2560) "parallel" {
    allocate(B.rf: Pointer(global uint16), uint16, [1920]), storage_scope = global;
    allocate(B: Pointer(global uint16), uint16, [1]), storage_scope = global {
      for (n: int32, 0, 1920) {
        let cse_var_1: int32 = ((m*1922) + n)
         {
          B.rf_1: Buffer(B.rf, uint16, [1920], [])[n] = 0u16
          B.rf_1[n] = (B.rf_1[n] + A[cse_var_1])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 2)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1922)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1923)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 1924)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 3844)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 3845)])
          B.rf_1[n] = (B.rf_1[n] + A[(cse_var_1 + 3846)])
        }
      }
      for (n_1: int32, 0, 1920) {
        B_1: Buffer(B, uint16, [1], [], align=2)[0] = 0u16
        B_1[0] = (B_1[0] + B.rf_1[n_1])
        C[((m*1920) + n_1)] = (B_1[0] / 9u16)
      }
    }
  }
}

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

使用TVM优化blur算子 #154

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 3 comments 1 reply

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Select a reply

Uh oh!

使用TVM优化blur算子 #154

Uh oh!

Uh oh!

linyv Sep 23, 2022

Replies: 3 comments · 1 reply

Uh oh!

linyv Sep 23, 2022 Author

Uh oh!

Uh oh!

linyv Sep 23, 2022 Author

Uh oh!

yzh119 Oct 2, 2022 Maintainer

Uh oh!

Uh oh!

linyv Sep 23, 2022 Author

linyv
Sep 23, 2022

Replies: 3 comments 1 reply

linyv
Sep 23, 2022
Author

linyv
Sep 23, 2022
Author

yzh119 Oct 2, 2022
Maintainer

linyv
Sep 23, 2022
Author