Skip to content

Commit d033a68

Browse files
authored
Merge pull request #1018 from zeux/mlc-slang
demo: Add an example Slang meshlet decoder
2 parents 576967e + b2f2d86 commit d033a68

File tree

3 files changed

+213
-3
lines changed

3 files changed

+213
-3
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,8 @@ When using the C++ API, `meshopt_decodeMeshlet` will automatically deduce the el
427427

428428
Decoder is heavily optimized and can directly target write-combined memory; you can expect it to run at 7-10 GB/s on modern desktop CPUs.
429429

430+
> Applications that do most of the streaming decompression on the GPU can also decode meshlet data on the GPU if CPU decoding is inconvenient; an example [meshletdec.slang](./demo/meshletdec.slang) shader is provided for the 32-bit output format, and it can be easily adapted to other formats, including custom ones.
431+
430432
Note that meshlet encoding assumes that the meshlet data was optimized; meshlets should be processed using `meshopt_optimizeMeshlet` before encoding. Additionally, vertex references should have a high degree of reference locality; this can be achieved by building meshlets from meshes optimized for vertex cache/fetch, or linearizing the vertex reference data (and reordering the vertex buffer accordingly). Feeding unoptimized data into the encoder will produce poor compression ratios. The codec preserves the order of triangles; however, it can rotate each triangle to improve the compression ratio (which means the provoking vertex may change).
431433

432434
Meshlets without vertex references are supported; passing `NULL` vertices and `0` vertex count during encoding and decoding will produce encoded meshlets with just triangle data. Note that parameters supplied during decoding must match those used during encoding; if a meshlet was encoded with vertex references, it must be decoded with the same number of vertex references.

demo/meshletdec.slang

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
/**
2+
* meshletdec.slang - an example GPU decoder for meshlet data encoded using meshopt_encodeMeshlet
3+
* This is intended to be used as a starting point for applications that want to decode meshlet data on the GPU.
4+
*
5+
* The shader exposes an entrypoint, decodeMeshlets, that decodes a set of meshlets; each meshlet is decoded independently,
6+
* and the output vertex/triangle data is written as uint32 per element (triangle data is written as 0xccbbaa).
7+
* This matches the output format for meshopt_decodeMeshlet with vertex_size=4 triangle_size=4. If alternative formats are
8+
* needed, the code should be changed to output them; note that for triangle data, it may make sense to output data to shared
9+
* memory to be able to use larger aligned 32-bit writes to global memory after that.
10+
*
11+
* Copyright (C) 2016-2026, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
12+
* This code is distributed under the MIT License. See notice at the end of this file.
13+
*/
14+
15+
/**
 * Per-meshlet decode descriptor; decodeMeshlets reads one entry per thread from gMeshlets.
 */
struct MeshletDesc
{
	// index of the first encoded byte of this meshlet inside gStream
	uint stream_offset;

	// index of the first output element inside gOutput; vertex references are written first,
	// and the triangles follow immediately after them
	uint output_offset;

	// number of encoded bytes that belong to this meshlet
	uint16_t encoded_size;

	// number of vertex references to decode
	uint8_t vertex_count;

	// number of triangles to decode
	uint8_t triangle_count;
};
23+
24+
// encoded meshlet bytes; addressed with byte indices
[[vk::binding(0)]]
StructuredBuffer<uint8_t> gStream : register(t0);

// one descriptor per meshlet to decode
[[vk::binding(1)]]
StructuredBuffer<MeshletDesc> gMeshlets : register(t1);

// decoded output, one uint per vertex reference / triangle
[[vk::binding(2)]]
RWStructuredBuffer<uint> gOutput : register(u2);

// dispatch-wide configuration
[[vk::binding(3)]]
cbuffer MeshletConfigCB : register(b3)
{
	// number of valid entries in gMeshlets; threads with an index at or past this value exit early
	uint gMeshletCount;
}
32+
33+
/**
 * Decodes the vertex reference stream of one meshlet from gStream and writes the references to gOutput.
 *
 * out_vertices - index in gOutput that receives the first decoded reference
 * ctrl         - index in gStream of the control bytes (one byte per group of four references)
 * data         - index in gStream of the variable-length delta bytes
 * bound        - largest index at which a group of four may start reading delta bytes
 * vertex_count - number of references to write
 *
 * Returns the index just past the consumed delta bytes, or ~0u if a group would start past bound.
 */
uint decodeVertices(uint out_vertices, uint ctrl, uint data, uint bound, uint vertex_count)
{
	// every delta is biased by one, so starting from ~0u lets a zero delta produce reference 0
	uint prev = ~0u;

	for (uint i = 0; i < vertex_count; i += 4)
	{
		// the cursor is validated once per group of four; the reads below may run past it
		if (data > bound)
			return ~0u;

		uint control = uint(gStream[ctrl + i / 4]);
		bool wide = control == 0xff;

		for (int k = 0; k < 4; ++k)
		{
			// 2-bit code of element k: low bit is control bit k, high bit is control bit k + 4
			int lo = (control >> k) & 1;
			int hi = (control >> (k + 3)) & 2;
			int code = lo | hi;

			// a control byte of 0xff selects 4-byte deltas; otherwise the code is the byte count
			int length = wide ? 4 : code;

			// always fetch four bytes and mask away the ones that are not part of this delta
			uint b0 = uint(gStream[data + 0]);
			uint b1 = uint(gStream[data + 1]);
			uint b2 = uint(gStream[data + 2]);
			uint b3 = uint(gStream[data + 3]);
			uint keep = (length != 4) ? (1 << (8 * length)) - 1 : ~0u;
			uint v = (b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)) & keep;

			// undo zigzag encoding, then apply the delta with its +1 bias
			uint delta = (v >> 1) ^ -int(v & 1);
			uint value = prev + delta + 1;

			// the last group can be partial: decode all four slots but only store the real ones
			if (i + k < vertex_count)
				gOutput[out_vertices + i + k] = value;

			data += length;
			prev = value;
		}
	}

	return data;
}
67+
68+
/**
 * Decodes one triangle from a 4-bit code and updates the decoder state.
 *
 * code               - 4-bit triangle code; 0-11 reuse an edge from the fifo, 12-15 restart
 * extra0/1/2         - the next three bytes of the extra stream (only some of them are consumed)
 * fifo0/fifo1/fifo2  - the three most recent triangles packed as 0xcbac, newest in fifo0
 * next               - next sequential vertex index; advanced for every implicit vertex used
 * extra              - extra stream cursor; advanced by the number of extra bytes consumed
 *
 * Returns the triangle packed as 0xccbbaa.
 */
uint decodeTriangle(uint code, uint extra0, uint extra1, uint extra2, inout uint fifo0, inout uint fifo1, inout uint fifo2, inout uint next, inout uint extra)
{
	bool restart = code >= 12;

	// reuse candidate: codes 0-3 read fifo0, 4-7 read fifo1, the rest read fifo2;
	// bit 1 of the code selects the upper 16 bits of the entry (the second edge of that triangle)
	uint entry = code < 4 ? fifo0 : (code < 8 ? fifo1 : fifo2);
	uint shift = (code << 3) & 16;
	uint edge = entry >> shift;
	uint reuse_a = (edge >> 8) & 0xff;
	uint reuse_b = edge & 0xff;
	uint reuse_c = (code & 1) == 1 ? extra0 : next;

	// restart candidate: the two low bits give the number of explicit vertices (0-3);
	// the remaining corners are taken sequentially from next
	uint explicit_count = code & 3;
	uint restart_a = explicit_count > 0 ? extra0 : next;
	uint restart_b = explicit_count > 1 ? extra1 : next + (1 - explicit_count);
	uint restart_c = explicit_count > 2 ? extra2 : next + (2 - explicit_count);

	// pick one of the two candidates and repack the triangle into edge format (0xcbac)
	uint a = restart ? restart_a : reuse_a;
	uint b = restart ? restart_b : reuse_b;
	uint c = restart ? restart_c : reuse_c;
	uint packed = c | (a << 8) | (b << 16) | (c << 24);

	// reuse codes keep the extra count in 1 low bit, restart codes in 2 low bits;
	// whatever was not read from the extra stream came from next
	uint count_mask = restart ? 3 : 1;
	next += (count_mask - code) & count_mask;
	extra += code & count_mask;

	// push the new triangle to the front of the fifo
	fifo2 = fifo1;
	fifo1 = fifo0;
	fifo0 = packed;

	// drop the duplicated low corner byte: 0xcbac => 0xcba
	return packed >> 8;
}
101+
102+
/**
 * Decodes the triangle stream of one meshlet from gStream and writes one uint per triangle to gOutput.
 *
 * out_triangles  - index in gOutput that receives the first decoded triangle
 * codes          - index in gStream of the triangle codes (two 4-bit codes per byte)
 * extra          - index in gStream of the extra vertex bytes
 * bound          - largest index at which a pair of triangles may start reading extra bytes
 * triangle_count - number of triangles to write
 *
 * Returns the index just past the consumed extra bytes, or ~0u if a pair would start past bound.
 */
uint decodeTriangles(uint out_triangles, uint codes, uint extra, uint bound, uint triangle_count)
{
	uint next = 0;

	// three most recent triangles, each holding two edge fifo entries in one uint: 0xcbac
	uint fifo0 = 0;
	uint fifo1 = 0;
	uint fifo2 = 0;

	for (uint i = 0; i < triangle_count; i += 2)
	{
		// the cursor is validated once per pair of triangles
		if (extra > bound)
			return ~0u;

		uint pair = uint(gStream[codes + i / 2]);

		// low nibble: first triangle of the pair
		uint e0 = uint(gStream[extra + 0]);
		uint e1 = uint(gStream[extra + 1]);
		uint e2 = uint(gStream[extra + 2]);
		uint first = decodeTriangle(pair & 15, e0, e1, e2, fifo0, fifo1, fifo2, next, extra);

		gOutput[out_triangles + i] = first;

		// high nibble: second triangle; the extra bytes are re-read because the cursor may have moved
		e0 = uint(gStream[extra + 0]);
		e1 = uint(gStream[extra + 1]);
		e2 = uint(gStream[extra + 2]);
		uint second = decodeTriangle(pair >> 4, e0, e1, e2, fifo0, fifo1, fifo2, next, extra);

		// the second nibble is always decoded, but it is stored only if that triangle exists
		if (i + 1 < triangle_count)
			gOutput[out_triangles + i + 1] = second;
	}

	return extra;
}
134+
135+
/**
 * Decodes one encoded meshlet from gStream into gOutput.
 *
 * out_vertices   - index in gOutput for the decoded vertex references
 * vertex_count   - number of vertex references to decode
 * out_triangles  - index in gOutput for the decoded triangles
 * triangle_count - number of triangles to decode
 * buffer         - index in gStream of the first encoded byte
 * buffer_size    - number of encoded bytes
 *
 * Encoded layout, front to back: vertex deltas and triangle extras, gap padding, control bytes, triangle codes.
 *
 * Returns 0 on success, -2 if the buffer is too small or a stream cursor runs past its bound,
 * and -3 if decoding completes without consuming exactly the bytes before the bound.
 */
int decodeMeshlet(uint out_vertices, uint vertex_count, uint out_triangles, uint triangle_count, uint buffer, uint buffer_size)
{
	// two triangle codes per byte, four vertex codes per byte
	uint codes_size = (triangle_count + 1) / 2;
	uint ctrl_size = (vertex_count + 3) / 4;

	// the fixed-size tail is padded so that it occupies at least 16 bytes
	uint tail_size = codes_size + ctrl_size;
	uint gap_size = (tail_size < 16) ? 16 - tail_size : 0;

	if (buffer_size < tail_size + gap_size)
		return -2;

	// the tail sections are located by walking backwards from the end of the buffer
	uint end = buffer + buffer_size;
	uint codes = end - codes_size;
	uint ctrl = codes - ctrl_size;

	// gap ensures we have at least 16 bytes available after bound; this allows decoder to over-read safely
	uint bound = ctrl - gap_size;

	uint cursor = decodeVertices(out_vertices, ctrl, buffer, bound, vertex_count);
	if (cursor == ~0u)
		return -2;

	// triangle extras start right where the vertex deltas ended
	cursor = decodeTriangles(out_triangles, codes, cursor, bound, triangle_count);
	if (cursor == ~0u)
		return -2;

	// a valid stream is consumed exactly up to the bound
	if (cursor != bound)
		return -3;

	return 0;
}
162+
163+
/**
 * Compute entry point: each thread decodes the meshlet whose descriptor index equals its dispatch thread id.
 * Vertex references are written at desc.output_offset and the triangles follow directly after them.
 */
[shader("compute")]
[numthreads(32, 1, 1)]
void decodeMeshlets(uint3 dispatch_thread_id: SV_DispatchThreadID)
{
	uint index = dispatch_thread_id.x;

	// threads past the end of the descriptor array have nothing to do
	if (index >= gMeshletCount)
		return;

	MeshletDesc desc = gMeshlets[index];

	uint vertex_count = uint(desc.vertex_count);
	uint triangle_count = uint(desc.triangle_count);
	uint encoded_size = uint(desc.encoded_size);

	uint vertices_at = desc.output_offset;
	uint triangles_at = desc.output_offset + vertex_count;

	int status = decodeMeshlet(vertices_at, vertex_count, triangles_at, triangle_count, desc.stream_offset, encoded_size);

	// if decoding failed, we write 0xff.. to the first word of the output data
	// this can be adjusted arbitrarily; for example, a separate buffer with a single status for the entire stream could be used
	// note that decoding fails only if the input data is corrupt; so this may not be required at all depending on the requirements
	if (status < 0)
		gOutput[desc.output_offset] = ~0u;
}
185+
186+
/**
187+
* Copyright (c) 2016-2026 Arseny Kapoulkine
188+
*
189+
* Permission is hereby granted, free of charge, to any person
190+
* obtaining a copy of this software and associated documentation
191+
* files (the "Software"), to deal in the Software without
192+
* restriction, including without limitation the rights to use,
193+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
194+
* copies of the Software, and to permit persons to whom the
195+
* Software is furnished to do so, subject to the following
196+
* conditions:
197+
*
198+
* The above copyright notice and this permission notice shall be
199+
* included in all copies or substantial portions of the Software.
200+
*
201+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
202+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
203+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
204+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
205+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
206+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
208+
* OTHER DEALINGS IN THE SOFTWARE.
209+
*/

src/meshletcodec.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,6 @@ static const unsigned char* decodeVertices(V* vertices, const unsigned char* ctr
325325
{
326326
unsigned int last = ~0u;
327327

328-
static const unsigned int masks[] = {0, 0xff, 0xffff, 0xffffff, 0xffffffff};
329-
330328
for (size_t i = 0; i < vertex_count; i += 4)
331329
{
332330
if (data > bound)
@@ -340,7 +338,8 @@ static const unsigned char* decodeVertices(V* vertices, const unsigned char* ctr
340338
int length = code4 == 0xff ? 4 : code;
341339

342340
// branchlessly read up to 4 bytes
343-
unsigned int v = (data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24)) & masks[length];
341+
unsigned int mask = (length == 4) ? ~0u : (1 << (8 * length)) - 1;
342+
unsigned int v = (data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24)) & mask;
344343

345344
// unzigzag + 1
346345
unsigned int d = (v >> 1) ^ -int(v & 1);

0 commit comments

Comments
 (0)