Commit e2447be

Add mask load/store function for partial vector load/store
1 parent ccfd3d6 commit e2447be

7 files changed, +231 -13 lines changed

7 files changed

+231
-13
lines changed

src/main.zig

Lines changed: 20 additions & 0 deletions

@@ -4,9 +4,13 @@ const img = @import("image_processing.zig");
 const sd = @import("simd_sample.zig");
 const simd = @import("simd_core.zig");
 const bisort = @import("bitonic_sort.zig");
+const vqsort = @import("vec_qsort.zig");
 
 const Allocator = std.mem.Allocator;
 
+const VecLen = simd.VecLen;
+const VecType = simd.VecType;
+
 // export fn _start() callconv(.C) noreturn {
 //     try @call(.auto, main, .{});
 // }
@@ -21,6 +25,7 @@ pub fn main() !void {
     try sd.simdSample();
 
     bitonicSortSample();
+    vqsortSample();
 
     if (std.os.argv.len > 1) {
         try img.readAndProcessImage(std.mem.span(std.os.argv[1]));
@@ -41,3 +46,18 @@ fn bitonicSortSample() void {
     std.debug.print("sorted vec_int is: {any}\n", .{vec_int});
     return;
 }
+
+fn vqsortSample() void {
+    const IntType = u32;
+    var prnd = std.rand.DefaultPrng.init(83751737);
+    var array_int: [VecLen(IntType)]IntType = undefined;
+    for (&array_int) |*a| {
+        a.* = prnd.random().int(IntType);
+    }
+    array_int[VecLen(IntType) - 1] = 5;
+    std.debug.print("original array_int is: {any}\n", .{array_int});
+
+    vqsort.vqsort(IntType, array_int[0 .. VecLen(IntType) - 1]);
+    std.debug.print("vqsort array_int is: {any}\n", .{array_int});
+    return;
+}

src/pack_select.zig

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ fn packSelectVec128(vec: anytype, mask: @Vector(vectorLength(@TypeOf(vec)), bool
     return vec;
 }
 
-const table8x16: [256 * 8]u8 align(8) = table_indices: {
+const table8x16: [256 * 8]u8 align(16) = table_indices: {
     comptime var indices: @Vector(256 * 8, u8) = table16x8[0 .. 256 * 8].*;
     indices /= @splat(2);
     break :table_indices @bitCast(indices);
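Note: the bump from align(8) to align(16) presumably lines the table up for aligned 128-bit vector loads. A quick standalone check of that alignment assumption (hypothetical test, not part of this commit):

const std = @import("std");

test "128-bit vector alignment" {
    // On x86_64 and aarch64, a 16-byte vector has 16-byte natural alignment,
    // so a table used as a source of 128-bit loads wants align(16).
    try std.testing.expect(@alignOf(@Vector(16, u8)) == 16);
}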

src/simd_aarch64.zig

Lines changed: 13 additions & 0 deletions

@@ -1,5 +1,6 @@
 const std = @import("std");
 const simd = @import("simd_core.zig");
+const simdg = @import("simd_generic.zig");
 
 const target = @import("builtin").target;
 const arch = target.cpu.arch;
@@ -35,6 +36,18 @@ pub const SimdSamples = struct {
     }
 };
 
+pub fn maskedLoadVecOr(comptime T: type, val_vec: @Vector(VecLen(T), T), mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    return simdg.maskedLoadVecOr(T, val_vec, mask, buf);
+}
+
+pub fn maskedLoadVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    return simdg.maskedLoadVec(T, mask, buf);
+}
+
+pub fn maskedStoreVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    simdg.maskedStoreVec(T, mask, buf, vec);
+}
+
 inline fn neon_shuffle_u8(vec: @Vector(VecLen(u8), u8), idx: @Vector(VecLen(i8), i8)) @TypeOf(vec) {
     const neon_idx: @Vector(VecLen(u8), u8) = @bitCast(idx);
     return asm ("tbl.16b %[ret], { %[v0] }, %[v1]"

src/simd_core.zig

Lines changed: 15 additions & 0 deletions

@@ -43,6 +43,21 @@ pub fn VecChild(comptime T: type) type {
     return std.meta.Child(T);
 }
 
+pub fn isBitsPackedLeft(int_mask: anytype) bool {
+    const info = @typeInfo(@TypeOf(int_mask));
+    if (!(info == .Int or info == .ComptimeInt)) {
+        @compileError("int_mask is not an integer type");
+    }
+
+    // Check that all set bits of the mask are packed toward the lsb, as below:
+    // lsb .. msb
+    // [ 1, 1, .. 1, 0, 0, .. 0 ]
+    const isPackedLeft: bool = int_mask & (~(int_mask << 1)) == 0x1;
+    return isPackedLeft;
+}
+
 /// Given a bitmask, will return a mask where the bits are filled in between.
 /// It just reduces the bits with the XOR operator.
 /// On modern x86 and aarch64 CPUs, it should have a latency of 3 and a throughput of 1.
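Note: the packed-left test relies on a single identity: when the set bits form one contiguous run starting at bit 0, mask & ~(mask << 1) isolates exactly bit 0. A standalone sketch fixed to u8 (hypothetical helper mirroring isBitsPackedLeft above):

const std = @import("std");

// Mirrors isBitsPackedLeft, fixed to u8 for illustration.
fn isPackedLeft(mask: u8) bool {
    // 0b0111: mask << 1 == 0b1110, ~(..) has bit 0 set, & mask == 0b0001 -> true
    // 0b0101: mask << 1 == 0b1010, ~(..) & mask == 0b0101 != 1 -> false
    return mask & ~(mask << 1) == 0x1;
}

test "packed-left bit trick" {
    try std.testing.expect(isPackedLeft(0b0000_0111));
    try std.testing.expect(!isPackedLeft(0b0000_0101));
    try std.testing.expect(!isPackedLeft(0b0000_0000));
}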

src/simd_generic.zig

Lines changed: 70 additions & 4 deletions

@@ -7,17 +7,83 @@ const VecType = simd.VecType;
 const target = @import("builtin").target;
 const arch = target.cpu.arch;
 
-const c = @cImport(
-    @cInclude("arm_neon.h"),
-);
-
 pub const SimdSamples = struct {
     pub fn binOpI16x8(vec1: simd.I16x8, vec2: simd.I16x8) simd.I16x8 {
         const acc = vec1 * vec2;
         return acc;
     }
 };
 
+fn CopyPtrAttrs(
+    comptime source: type,
+    comptime size: std.builtin.Type.Pointer.Size,
+    comptime child: type,
+) type {
+    const info = @typeInfo(source).Pointer;
+    return @Type(.{
+        .Pointer = .{
+            .size = size,
+            .is_const = info.is_const,
+            .is_volatile = info.is_volatile,
+            .is_allowzero = info.is_allowzero,
+            .alignment = info.alignment,
+            .address_space = info.address_space,
+            .child = child,
+            .sentinel = null,
+        },
+    });
+}
+
+fn AsArrayReturnType(comptime T: type, comptime P: type) type {
+    const size = @sizeOf(std.meta.Child(P));
+    return CopyPtrAttrs(P, .One, [size / @sizeOf(T)]T);
+}
+
+/// Given a pointer to a single item, returns a pointer to an array of the
+/// underlying type, preserving pointer attributes.
+pub fn asArray(comptime T: type, ptr: anytype) AsArrayReturnType(T, @TypeOf(ptr)) {
+    return @ptrCast(@alignCast(ptr));
+}
+
+pub fn maskedLoadVecOr(comptime T: type, val_vec: @Vector(VecLen(T), T), mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    return @select(T, mask, maskedLoadPartVec(T, mask, buf), val_vec);
+}
+
+pub fn maskedLoadVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    const zero_vec: @Vector(VecLen(T), T) = @splat(0);
+    return @select(T, mask, maskedLoadPartVec(T, mask, buf), zero_vec);
+}
+
+// Load only a partial vector from buf; lanes above the highest set mask bit
+// are left undefined (callers blend them away with @select).
+inline fn maskedLoadPartVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    var vec: @Vector(VecLen(T), T) = undefined;
+
+    const int_mask = @as(std.meta.Int(.unsigned, VecLen(T)), @bitCast(mask));
+    const load_len = VecLen(T) - @clz(int_mask);
+    const array = asArray(T, &vec);
+    @memcpy(array[0..load_len], buf[0..load_len]);
+    return vec;
+}
+
+pub fn maskedStoreVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    const int_mask = @as(std.meta.Int(.unsigned, VecLen(T)), @bitCast(mask));
+    const store_len = VecLen(T) - @clz(int_mask);
+    if (simd.isBitsPackedLeft(int_mask)) {
+        // All set bits of the mask are packed left:
+        // lsb .. msb
+        // [ 1, 1, .. 1, 0, 0, .. 0 ]
+        const array = asArray(T, &vec);
+        @memcpy(buf[0..store_len], array[0..store_len]);
+        return;
+    }
+
+    // Otherwise fall back to read-modify-write: load the original buffer
+    // contents, blend the selected lanes in, and store the result back.
+    var origin_vec: @Vector(VecLen(T), T) = undefined;
+    const origin_arr = asArray(T, &origin_vec);
+    @memcpy(origin_arr[0..store_len], buf[0..store_len]);
+    const blended_vec = @select(T, mask, vec, origin_vec);
+    const blended_arr = asArray(T, &blended_vec);
+    @memcpy(buf[0..store_len], blended_arr[0..store_len]);
+}
+
 pub fn tableLookupBytes(tbl: @Vector(VecLen(u8), u8), idx: @Vector(VecLen(i8), i8)) @Vector(VecLen(u8), u8) {
     comptime var i = 0;
     var out_vec: @Vector(VecLen(u8), u8) = undefined;
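Note: the fallback path above gives masked-store its semantics: unselected lanes keep whatever is already in buf. A minimal 4-lane sketch of that blend (hypothetical test, not part of this commit):

const std = @import("std");

test "masked store blends unselected lanes" {
    var buf = [4]u32{ 10, 20, 30, 40 };
    const vec: @Vector(4, u32) = .{ 1, 2, 3, 4 };
    const mask: @Vector(4, bool) = .{ true, false, true, false };

    // Emulate maskedStoreVec's read-modify-write fallback: load, blend, store.
    const origin: @Vector(4, u32) = buf;
    const blended: [4]u32 = @select(u32, mask, vec, origin);
    @memcpy(buf[0..], blended[0..]);

    // Lanes 0 and 2 come from vec; lanes 1 and 3 keep the old buf values.
    try std.testing.expectEqualSlices(u32, &[4]u32{ 1, 20, 3, 40 }, &buf);
}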

src/simd_x86_64.zig

Lines changed: 76 additions & 0 deletions

@@ -4,6 +4,7 @@ const target = @import("builtin").target;
 const arch = target.cpu.arch;
 
 const simd = @import("simd_core.zig");
+const simdg = @import("simd_generic.zig");
 
 const VEC_BITS_LEN = simd.VEC_BITS_LEN;
 const VecLen = simd.VecLen;
@@ -17,13 +18,88 @@ const c = @cImport({
     @cInclude("x86_64_intrins.h");
 });
 
+fn hasAvx2() bool {
+    if (arch == .x86_64) {
+        const hasFeature = std.Target.x86.featureSetHas;
+        return hasFeature(target.cpu.features, .avx2);
+    }
+
+    return false;
+}
+
 pub const SimdSamples = struct {
     pub fn binOpI16x8(vec1: simd.I16x8, vec2: simd.I16x8) simd.I16x8 {
         const acc = c._mm_mullo_epi16(@bitCast(vec1), @bitCast(vec2));
         return @bitCast(acc);
     }
 };
 
+// vpmaskmovd/vpmaskmovq test the sign bit of each 32/64-bit mask lane, so the
+// bool mask is widened to all-ones/all-zeros lanes first. The buffer must be
+// a memory operand, hence buf.ptr in a general register.
+inline fn mm_maskload_vec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    const all_zeros: @Vector(VecLen(T), T) = @splat(0x0);
+    const all_ones = ~all_zeros;
+    const t_mask = @select(T, mask, all_ones, all_zeros);
+    const mm_mask: @Vector(VecLen(i64), i64) = @bitCast(t_mask);
+    switch (@bitSizeOf(T)) {
+        32 => {
+            return asm volatile ("vpmaskmovd (%[addr]), %[mask], %[result]"
+                : [result] "=x" (-> @Vector(VecLen(T), T)),
+                : [mask] "x" (mm_mask),
+                  [addr] "r" (buf.ptr)
+                : "memory"
+            );
+        },
+        64 => {
+            return asm volatile ("vpmaskmovq (%[addr]), %[mask], %[result]"
+                : [result] "=x" (-> @Vector(VecLen(T), T)),
+                : [mask] "x" (mm_mask),
+                  [addr] "r" (buf.ptr)
+                : "memory"
+            );
+        },
+        else => @compileError("Unsupported type " ++ @typeName(T)),
+    }
+}
+
+pub fn mm_maskstore_vec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    const mm_vec: @Vector(VecLen(i64), i64) = @bitCast(vec);
+
+    const all_zeros: @Vector(VecLen(T), T) = @splat(0x0);
+    const all_ones = ~all_zeros;
+    const t_mask = @select(T, mask, all_ones, all_zeros);
+    const mm_mask: @Vector(VecLen(i64), i64) = @bitCast(t_mask);
+    switch (@bitSizeOf(T)) {
+        32 => {
+            asm volatile ("vpmaskmovd %[vec], %[mask], (%[addr])"
+                :
+                : [addr] "r" (buf.ptr),
+                  [mask] "x" (mm_mask),
+                  [vec] "x" (mm_vec)
+                : "memory"
+            );
+        },
+        64 => {
+            asm volatile ("vpmaskmovq %[vec], %[mask], (%[addr])"
+                :
+                : [addr] "r" (buf.ptr),
+                  [mask] "x" (mm_mask),
+                  [vec] "x" (mm_vec)
+                : "memory"
+            );
+        },
+        else => @compileError("Unsupported type " ++ @typeName(T)),
+    }
+}
+
+pub fn maskedLoadVecOr(comptime T: type, val_vec: @Vector(VecLen(T), T), mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    if (comptime hasAvx2() and (@bitSizeOf(T) == 32 or @bitSizeOf(T) == 64)) {
+        const vec = mm_maskload_vec(T, mask, buf);
+        return @select(T, mask, vec, val_vec);
+    } else {
+        return simdg.maskedLoadVecOr(T, val_vec, mask, buf);
+    }
+}
+
+pub fn maskedLoadVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T) @Vector(VecLen(T), T) {
+    if (comptime hasAvx2() and (@bitSizeOf(T) == 32 or @bitSizeOf(T) == 64)) {
+        return mm_maskload_vec(T, mask, buf);
+    } else {
+        return simdg.maskedLoadVec(T, mask, buf);
+    }
+}
+
+pub fn maskedStoreVec(comptime T: type, mask: @Vector(VecLen(T), bool), buf: []T, vec: @Vector(VecLen(T), T)) void {
+    if (comptime hasAvx2() and (@bitSizeOf(T) == 32 or @bitSizeOf(T) == 64)) {
+        return mm_maskstore_vec(T, mask, buf, vec);
+    } else {
+        return simdg.maskedStoreVec(T, mask, buf, vec);
+    }
+}
+
 inline fn mm_shuffle_u8(vec: @Vector(VecLen(u8), u8), idx: @Vector(VecLen(i8), i8)) @TypeOf(vec) {
     const mm_vec: @Vector(VecLen(i64), i64) = @bitCast(vec);
     const mm_idx: @Vector(VecLen(i64), i64) = @bitCast(idx);
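Note: the three wrappers above pick a path with comptime hasAvx2(), so the untaken branch is discarded at compile time and a non-AVX2 build never instantiates the inline asm. A self-contained sketch of the same dispatch pattern (hypothetical print payloads):

const std = @import("std");
const target = @import("builtin").target;

// Standalone sketch of comptime feature dispatch: the untaken branch is
// discarded at compile time, so it may hold target-specific code.
fn hasAvx2() bool {
    if (target.cpu.arch == .x86_64) {
        return std.Target.x86.featureSetHas(target.cpu.features, .avx2);
    }
    return false;
}

pub fn main() void {
    if (comptime hasAvx2()) {
        std.debug.print("AVX2 masked load/store path\n", .{});
    } else {
        std.debug.print("generic fallback path\n", .{});
    }
}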

src/vec_qsort.zig

Lines changed: 36 additions & 8 deletions

@@ -1,18 +1,46 @@
 const std = @import("std");
 const builtin = @import("builtin");
+const bisort = @import("bitonic_sort.zig");
+const simd = @import("simd_core.zig");
 
-fn vqsort()
-{
+const VEC_BITS_LEN = simd.VEC_BITS_LEN;
+const VecLen = simd.VecLen;
+const VecType = simd.VecType;
+const vectorLength = simd.vectorLength;
+const VecChild = simd.VecChild;
+
+pub fn vqsort(comptime T: type, buf: []T) void {
+    const maxLevels: usize = 60;
+    doVecQSort(T, buf, maxLevels);
 }
 
-fn doVecQsort()
-{
+fn doVecQSort(comptime T: type, buf: []T, remLevels: usize) void {
+    if (buf.len <= VecLen(T)) {
+        // Base case: widen the short tail to one full vector. Lanes past
+        // buf.len are masked off and padded with the maximum value so they
+        // sort to the end and are never stored back.
+        const asc_idx = std.simd.iota(usize, VecLen(T));
+        const mask = asc_idx < @as(@Vector(VecLen(T), usize), @splat(buf.len));
+        const pad = switch (@typeInfo(T)) {
+            .Int, .ComptimeInt => std.math.maxInt(T),
+            .Float, .ComptimeFloat => std.math.floatMax(T),
+            else => @compileError("bad type"),
+        };
+        const pad_vec: @Vector(VecLen(T), T) = @splat(pad);
+        var vec: @Vector(VecLen(T), T) = simd.maskedLoadVecOr(T, pad_vec, mask, buf);
+        vec = bisort.bitonicSort1V(T, vec);
+        simd.maskedStoreVec(T, mask, buf, vec);
+        return;
+    }
+
+    const pivot = getPivot(T, buf);
+    const mid = partition(T, buf, pivot);
+    doVecQSort(T, buf[0..mid], remLevels - 1);
+    doVecQSort(T, buf[mid + 1 .. buf.len], remLevels - 1);
 }
 
-fn partition()
-{
+// Placeholder: always splits in the middle; a real partition around pivot
+// is still to come.
+fn partition(comptime T: type, buf: []T, pivot: T) usize {
+    _ = pivot;
+    return (buf.len / 2);
 }
 
-fn choosePivot()
-{
+fn getPivot(comptime T: type, buf: []T) T {
+    return buf[0];
 }
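Note: the base case builds its lane mask by comparing an iota vector against buf.len, which is exactly the partial-load/store mask the new functions consume. A standalone sketch of that mask, fixed to 8 lanes (hypothetical test; the real code uses VecLen(T)):

const std = @import("std");

test "tail lanes are masked off" {
    const N = 8;
    const len: usize = 5; // stand-in for buf.len
    const asc_idx = std.simd.iota(usize, N);
    const mask = asc_idx < @as(@Vector(N, usize), @splat(len));
    // Lanes 0..4 are selected; lanes 5..7 will carry the pad value
    // (maxInt/floatMax) so they land at the end after sorting.
    const lanes: [N]bool = mask;
    try std.testing.expectEqualSlices(
        bool,
        &.{ true, true, true, true, true, false, false, false },
        &lanes,
    );
}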
