Commit e2447be

Add mask load/store function for partial vector load/store
1 parent ccfd3d6 commit e2447be

7 files changed, +231 -13 lines changed

7 files changed

+231
-13
lines changed

src/main.zig

Lines changed: 20 additions & 0 deletions

@@ -4,9 +4,13 @@ const img = @import("image_processing.zig");
 const sd = @import("simd_sample.zig");
 const simd = @import("simd_core.zig");
 const bisort = @import("bitonic_sort.zig");
+const vqsort = @import("vec_qsort.zig");
 
 const Allocator = std.mem.Allocator;
 
+const VecLen = simd.VecLen;
+const VecType = simd.VecType;
+
 // export fn _start() callconv(.C) noreturn {
 //     try @call(.auto, main, .{});
 // }
@@ -21,6 +25,7 @@ pub fn main() !void {
     try sd.simdSample();
 
     bitonicSortSample();
+    vqsortSample();
 
     if (std.os.argv.len > 1) {
         try img.readAndProcessImage(std.mem.span(std.os.argv[1]));
@@ -41,3 +46,18 @@ fn bitonicSortSample() void {
     std.debug.print("sorted vec_int is: {any}\n", .{vec_int});
     return;
 }
+
+fn vqsortSample() void {
+    const IntType = u32;
+    var prnd = std.rand.DefaultPrng.init(83751737);
+    var array_int: [VecLen(IntType)]IntType = undefined;
+    for (&array_int) |*a| {
+        a.* = prnd.random().int(IntType);
+    }
+    array_int[VecLen(IntType) - 1] = 5;
+    std.debug.print("original array_int is: {any}\n", .{array_int});
+
+    vqsort.vqsort(IntType, array_int[0 .. VecLen(IntType) - 1]);
+    std.debug.print("vqsort array_int is: {any}\n", .{array_int});
+    return;
+}

src/pack_select.zig

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ fn packSelectVec128(vec: anytype, mask: @Vector(vectorLength(@TypeOf(vec)), bool
     return vec;
 }
 
-const table8x16: [256 * 8]u8 align(8) = table_indices: {
+const table8x16: [256 * 8]u8 align(16) = table_indices: {
     comptime var indices: @Vector(256 * 8, u8) = table16x8[0 .. 256 * 8].*;
     indices /= @splat(2);
     break :table_indices @bitCast(indices);
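Note: the bump from align(8) to align(16) presumably lines the table up for aligned 128-bit vector loads. A quick standalone check of that alignment assumption (hypothetical test, not part of this commit):

const std = @import("std");

test "128-bit vector alignment" {
    // On x86_64 and aarch64, a 16-byte vector has 16-byte natural alignment,
    // so a table used as a source of 128-bit loads wants align(16).
    try std.testing.expect(@alignOf(@Vector(16, u8)) == 16);
}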

src/simd_aarch64.zig

Lines changed: 13 additions & 0 deletions

@@ -1,5 +1,6 @@
 const std = @import("std");
 const simd = @import("simd_core.zig");
+const simdg = @import("simd_generic.zig");
 
 const target = @import("builtin").target;
 const arch = target.cpu.arch;
@@ -35,6 +36,18 @@ pub const SimdSamples = struct {
     }
 };
 
+pub fn maskedLoadVecOr(comptime T: type, val_vec: @Vector(VecLen(T), T), mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    return simdg.maskedLoadVecOr(T, val_vec, mask, buf);
+}
+
+pub fn maskedLoadVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    return simdg.maskedLoadVec(T, mask, buf);
+}
+
+pub fn maskedStoreVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    simdg.maskedStoreVec(T, mask, buf, vec);
+}
+
 inline fn neon_shuffle_u8(vec: @Vector(VecLen(u8), u8), idx: @Vector(VecLen(i8), i8)) @TypeOf(vec) {
     const neon_idx: @Vector(VecLen(u8), u8) = @bitCast(idx);
     return asm ("tbl.16b %[ret], { %[v0] }, %[v1]"

src/simd_core.zig

Lines changed: 15 additions & 0 deletions

@@ -43,6 +43,21 @@ pub fn VecChild(comptime T: type) type {
     return std.meta.Child(T);
 }
 
+pub fn isBitsPackedLeft(int_mask: anytype) bool {
+    const info = @typeInfo(@TypeOf(int_mask));
+    if (!(info == .Int or info == .ComptimeInt)) {
+        @compileError("int_mask is not an integer type");
+    }
+
+    // Check that all set bits of the mask are packed toward the lsb, as below:
+    // lsb .. msb
+    // [ 1, 1, .. 1, 0, 0, .. 0 ]
+    const isPackedLeft: bool = int_mask & (~(int_mask << 1)) == 0x1;
+    return isPackedLeft;
+}
+
 /// Given a bitmask, will return a mask where the bits are filled in between.
 /// It just reduces the bits with the XOR operator.
 /// On modern x86 and aarch64 CPUs, it should have a latency of 3 and a throughput of 1.
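Note: the packed-left test relies on a single identity: when the set bits form one contiguous run starting at bit 0, mask & ~(mask << 1) isolates exactly bit 0. A standalone sketch fixed to u8 (hypothetical helper mirroring isBitsPackedLeft above):

const std = @import("std");

// Mirrors isBitsPackedLeft, fixed to u8 for illustration.
fn isPackedLeft(mask: u8) bool {
    // 0b0111: mask << 1 == 0b1110, ~(..) has bit 0 set, & mask == 0b0001 -> true
    // 0b0101: mask << 1 == 0b1010, ~(..) & mask == 0b0101 != 1 -> false
    return mask & ~(mask << 1) == 0x1;
}

test "packed-left bit trick" {
    try std.testing.expect(isPackedLeft(0b0000_0111));
    try std.testing.expect(!isPackedLeft(0b0000_0101));
    try std.testing.expect(!isPackedLeft(0b0000_0000));
}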

src/simd_generic.zig

Lines changed: 70 additions & 4 deletions

@@ -7,17 +7,83 @@ const VecType = simd.VecType;
 const target = @import("builtin").target;
 const arch = target.cpu.arch;
 
-const c = @cImport(
-    @cInclude("arm_neon.h"),
-);
-
 pub const SimdSamples = struct {
     pub fn binOpI16x8(vec1: simd.I16x8, vec2: simd.I16x8) simd.I16x8 {
         const acc = vec1 * vec2;
         return acc;
     }
 };
 
+fn CopyPtrAttrs(
+    comptime source: type,
+    comptime size: std.builtin.Type.Pointer.Size,
+    comptime child: type,
+) type {
+    const info = @typeInfo(source).Pointer;
+    return @Type(.{
+        .Pointer = .{
+            .size = size,
+            .is_const = info.is_const,
+            .is_volatile = info.is_volatile,
+            .is_allowzero = info.is_allowzero,
+            .alignment = info.alignment,
+            .address_space = info.address_space,
+            .child = child,
+            .sentinel = null,
+        },
+    });
+}
+
+fn AsArrayReturnType(comptime T: type, comptime P: type) type {
+    const size = @sizeOf(std.meta.Child(P));
+    return CopyPtrAttrs(P, .One, [size / @sizeOf(T)]T);
+}
+
+/// Given a pointer to a single item, returns a pointer to an array of the
+/// underlying type, preserving pointer attributes.
+pub fn asArray(comptime T: type, ptr: anytype) AsArrayReturnType(T, @TypeOf(ptr)) {
+    return @ptrCast(@alignCast(ptr));
+}
+
+pub fn maskedLoadVecOr(comptime T: type, val_vec: @Vector(VecLen(T), T), mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    return @select(T, mask, maskedLoadPartVec(T, mask, buf), val_vec);
+}
+
+pub fn maskedLoadVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    const zero_vec: @Vector(VecLen(T), T) = @splat(0);
+    return @select(T, mask, maskedLoadPartVec(T, mask, buf), zero_vec);
+}
+
+// Load only a partial vector from buf; lanes above the highest set mask bit
+// are left undefined (callers blend them away with @select).
+inline fn maskedLoadPartVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    var vec: @Vector(VecLen(T), T) = undefined;
+
+    const int_mask = @as(std.meta.Int(.unsigned, VecLen(T)), @bitCast(mask));
+    const load_len = VecLen(T) - @clz(int_mask);
+    const array = asArray(T, &vec);
+    @memcpy(array[0..load_len], buf[0..load_len]);
+    return vec;
+}
+
+pub fn maskedStoreVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    const int_mask = @as(std.meta.Int(.unsigned, VecLen(T)), @bitCast(mask));
+    const store_len = VecLen(T) - @clz(int_mask);
+    if (simd.isBitsPackedLeft(int_mask)) {
+        // All set bits of the mask are packed left:
+        // lsb .. msb
+        // [ 1, 1, .. 1, 0, 0, .. 0 ]
+        const array = asArray(T, &vec);
+        @memcpy(buf[0..store_len], array[0..store_len]);
+        return;
+    }
+
+    // Otherwise fall back to read-modify-write: load the original buffer
+    // contents, blend the selected lanes in, and store the result back.
+    var origin_vec: @Vector(VecLen(T), T) = undefined;
+    const origin_arr = asArray(T, &origin_vec);
+    @memcpy(origin_arr[0..store_len], buf[0..store_len]);
+    const blended_vec = @select(T, mask, vec, origin_vec);
+    const blended_arr = asArray(T, &blended_vec);
+    @memcpy(buf[0..store_len], blended_arr[0..store_len]);
+}
+
 pub fn tableLookupBytes(tbl: @Vector(VecLen(u8), u8), idx: @Vector(VecLen(i8), i8)) @Vector(VecLen(u8), u8) {
     comptime var i = 0;
     var out_vec: @Vector(VecLen(u8), u8) = undefined;
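Note: the fallback path above gives masked-store its semantics: unselected lanes keep whatever is already in buf. A minimal 4-lane sketch of that blend (hypothetical test, not part of this commit):

const std = @import("std");

test "masked store blends unselected lanes" {
    var buf = [4]u32{ 10, 20, 30, 40 };
    const vec: @Vector(4, u32) = .{ 1, 2, 3, 4 };
    const mask: @Vector(4, bool) = .{ true, false, true, false };

    // Emulate maskedStoreVec's read-modify-write fallback: load, blend, store.
    const origin: @Vector(4, u32) = buf;
    const blended: [4]u32 = @select(u32, mask, vec, origin);
    @memcpy(buf[0..], blended[0..]);

    // Lanes 0 and 2 come from vec; lanes 1 and 3 keep the old buf values.
    try std.testing.expectEqualSlices(u32, &[4]u32{ 1, 20, 3, 40 }, &buf);
}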

src/simd_x86_64.zig

Lines changed: 76 additions & 0 deletions

@@ -4,6 +4,7 @@ const target = @import("builtin").target;
 const arch = target.cpu.arch;
 
 const simd = @import("simd_core.zig");
+const simdg = @import("simd_generic.zig");
 
 const VEC_BITS_LEN = simd.VEC_BITS_LEN;
 const VecLen = simd.VecLen;
@@ -17,13 +18,88 @@ const c = @cImport({
     @cInclude("x86_64_intrins.h");
 });
 
+fn hasAvx2() bool {
+    if (arch == .x86_64) {
+        const hasFeature = std.Target.x86.featureSetHas;
+        return hasFeature(target.cpu.features, .avx2);
+    }
+
+    return false;
+}
+
 pub const SimdSamples = struct {
     pub fn binOpI16x8(vec1: simd.I16x8, vec2: simd.I16x8) simd.I16x8 {
         const acc = c._mm_mullo_epi16(@bitCast(vec1), @bitCast(vec2));
         return @bitCast(acc);
     }
 };
 
+// vpmaskmovd/vpmaskmovq test the sign bit of each 32/64-bit mask lane, so the
+// bool mask is widened to all-ones/all-zeros lanes first. The buffer must be
+// a memory operand, hence buf.ptr in a general register.
+inline fn mm_maskload_vec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    const all_zeros: @Vector(VecLen(T), T) = @splat(0x0);
+    const all_ones = ~all_zeros;
+    const t_mask = @select(T, mask, all_ones, all_zeros);
+    const mm_mask: @Vector(VecLen(i64), i64) = @bitCast(t_mask);
+    switch (@bitSizeOf(T)) {
+        32 => {
+            return asm volatile ("vpmaskmovd (%[addr]), %[mask], %[result]"
+                : [result] "=x" (-> @Vector(VecLen(T), T)),
+                : [mask] "x" (mm_mask),
+                  [addr] "r" (buf.ptr)
+                : "memory"
+            );
+        },
+        64 => {
+            return asm volatile ("vpmaskmovq (%[addr]), %[mask], %[result]"
+                : [result] "=x" (-> @Vector(VecLen(T), T)),
+                : [mask] "x" (mm_mask),
+                  [addr] "r" (buf.ptr)
+                : "memory"
+            );
+        },
+        else => @compileError("Unsupported type " ++ @typeName(T)),
+    }
+}
+
+pub fn mm_maskstore_vec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    const mm_vec: @Vector(VecLen(i64), i64) = @bitCast(vec);
+
+    const all_zeros: @Vector(VecLen(T), T) = @splat(0x0);
+    const all_ones = ~all_zeros;
+    const t_mask = @select(T, mask, all_ones, all_zeros);
+    const mm_mask: @Vector(VecLen(i64), i64) = @bitCast(t_mask);
+    switch (@bitSizeOf(T)) {
+        32 => {
+            asm volatile ("vpmaskmovd %[vec], %[mask], (%[addr])"
+                :
+                : [addr] "r" (buf.ptr),
+                  [mask] "x" (mm_mask),
+                  [vec] "x" (mm_vec)
+                : "memory"
+            );
+        },
+        64 => {
+            asm volatile ("vpmaskmovq %[vec], %[mask], (%[addr])"
+                :
+                : [addr] "r" (buf.ptr),
+                  [mask] "x" (mm_mask),
+                  [vec] "x" (mm_vec)
+                : "memory"
+            );
+        },
+        else => @compileError("Unsupported type " ++ @typeName(T)),
+    }
+}
+
+pub fn maskedLoadVecOr(comptime T: type, val_vec: @Vector(VecLen(T), T), mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    if (comptime hasAvx2() and (@bitSizeOf(T) == 32 or @bitSizeOf(T) == 64)) {
+        const vec = mm_maskload_vec(T, mask, buf);
+        return @select(T, mask, vec, val_vec);
+    } else {
+        return simdg.maskedLoadVecOr(T, val_vec, mask, buf);
+    }
+}
+
+pub fn maskedLoadVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    if (comptime hasAvx2() and (@bitSizeOf(T) == 32 or @bitSizeOf(T) == 64)) {
+        return mm_maskload_vec(T, mask, buf);
+    } else {
+        return simdg.maskedLoadVec(T, mask, buf);
+    }
+}
+
+pub fn maskedStoreVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    if (comptime hasAvx2() and (@bitSizeOf(T) == 32 or @bitSizeOf(T) == 64)) {
+        return mm_maskstore_vec(T, mask, buf, vec);
+    } else {
+        return simdg.maskedStoreVec(T, mask, buf, vec);
+    }
+}
+
 inline fn mm_shuffle_u8(vec: @Vector(VecLen(u8), u8), idx: @Vector(VecLen(i8), i8)) @TypeOf(vec) {
     const mm_vec: @Vector(VecLen(i64), i64) = @bitCast(vec);
     const mm_idx: @Vector(VecLen(i64), i64) = @bitCast(idx);
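Note: the three wrappers above pick a path with comptime hasAvx2(), so the untaken branch is discarded at compile time and a non-AVX2 build never instantiates the inline asm. A self-contained sketch of the same dispatch pattern (hypothetical print payloads):

const std = @import("std");
const target = @import("builtin").target;

// Standalone sketch of comptime feature dispatch: the untaken branch is
// discarded at compile time, so it may hold target-specific code.
fn hasAvx2() bool {
    if (target.cpu.arch == .x86_64) {
        return std.Target.x86.featureSetHas(target.cpu.features, .avx2);
    }
    return false;
}

pub fn main() void {
    if (comptime hasAvx2()) {
        std.debug.print("AVX2 masked load/store path\n", .{});
    } else {
        std.debug.print("generic fallback path\n", .{});
    }
}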

src/vec_qsort.zig

Lines changed: 36 additions & 8 deletions

@@ -1,18 +1,46 @@
 const std = @import("std");
 const builtin = @import("builtin");
+const bisort = @import("bitonic_sort.zig");
+const simd = @import("simd_core.zig");
 
-fn vqsort()
-{
+const VEC_BITS_LEN = simd.VEC_BITS_LEN;
+const VecLen = simd.VecLen;
+const VecType = simd.VecType;
+const vectorLength = simd.vectorLength;
+const VecChild = simd.VecChild;
+
+pub fn vqsort(comptime T: type, buf: []T) void {
+    const maxLevels: usize = 60;
+    doVecQSort(T, buf, maxLevels);
 }
 
-fn doVecQsort()
-{
+fn doVecQSort(comptime T: type, buf: []T, remLevels: usize) void {
+    if (buf.len <= VecLen(T)) {
+        // Base case: widen the short tail to one full vector. Lanes past
+        // buf.len are masked off and padded with the maximum value so they
+        // sort to the end and are never stored back.
+        const asc_idx = std.simd.iota(usize, VecLen(T));
+        const mask = asc_idx < @as(@Vector(VecLen(T), usize), @splat(buf.len));
+        const pad = switch (@typeInfo(T)) {
+            .Int, .ComptimeInt => std.math.maxInt(T),
+            .Float, .ComptimeFloat => std.math.floatMax(T),
+            else => @compileError("bad type"),
+        };
+        const pad_vec: @Vector(VecLen(T), T) = @splat(pad);
+        var vec: @Vector(VecLen(T), T) = simd.maskedLoadVecOr(T, pad_vec, mask, buf);
+        vec = bisort.bitonicSort1V(T, vec);
+        simd.maskedStoreVec(T, mask, buf, vec);
+        return;
+    }
+
+    const pivot = getPivot(T, buf);
+    const mid = partition(T, buf, pivot);
+    doVecQSort(T, buf[0..mid], remLevels - 1);
+    doVecQSort(T, buf[mid + 1 .. buf.len], remLevels - 1);
 }
 
-fn partition()
-{
+// Placeholder: always splits in the middle; a real partition around pivot
+// is still to come.
+fn partition(comptime T: type, buf: []T, pivot: T) usize {
+    _ = pivot;
+    return (buf.len / 2);
 }
 
-fn choosePivot()
-{
+fn getPivot(comptime T: type, buf: []T) T {
+    return buf[0];
 }
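Note: the base case builds its lane mask by comparing an iota vector against buf.len, which is exactly the partial-load/store mask the new functions consume. A standalone sketch of that mask, fixed to 8 lanes (hypothetical test; the real code uses VecLen(T)):

const std = @import("std");

test "tail lanes are masked off" {
    const N = 8;
    const len: usize = 5; // stand-in for buf.len
    const asc_idx = std.simd.iota(usize, N);
    const mask = asc_idx < @as(@Vector(N, usize), @splat(len));
    // Lanes 0..4 are selected; lanes 5..7 will carry the pad value
    // (maxInt/floatMax) so they land at the end after sorting.
    const lanes: [N]bool = mask;
    try std.testing.expectEqualSlices(
        bool,
        &.{ true, true, true, true, true, false, false, false },
        &lanes,
    );
}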
