Skip to content

Commit c25ec1a

Browse files
committed
Avoid crash
1 parent 4e8dc4f commit c25ec1a

File tree

1 file changed

+79
-30
lines changed

1 file changed

+79
-30
lines changed

src/cuda/voxel.cu

Lines changed: 79 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2418,11 +2418,10 @@ __device__ const int edgeCorners[12][2] = {
24182418
};
24192419

24202420
__global__ void BuildVerticesKernel(const VoxelCudaNaive* voxels, float3 bb_min,
2421-
float3 resolution,
2422-
int3 vn, // voxel_num
2423-
float iso_level, int* d_edgeVertexIds,
2424-
int* d_vtxCounter, float3* d_vertices,
2425-
float weight, int max_vertices_num) {
2421+
float3 resolution, int3 vn, float iso_level,
2422+
int* d_edgeVertexIds, int* d_vtxCounter,
2423+
float3* d_vertices, float weight,
2424+
int max_vertices_num, int* d_overflow) {
24262425
int ix = blockIdx.x * blockDim.x + threadIdx.x;
24272426
int iy = blockIdx.y * blockDim.y + threadIdx.y;
24282427
int iz = blockIdx.z * blockDim.z + threadIdx.z;
@@ -2434,51 +2433,57 @@ __global__ void BuildVerticesKernel(const VoxelCudaNaive* voxels, float3 bb_min,
24342433
int edges = d_edgeTable[cubeIndex];
24352434
if (edges == 0) return;
24362435

2437-
// ベースとなる flat インデックス
2438-
// int baseFlat = iz * vn.y * vn.x + iy * vn.x + ix;
2439-
24402436
for (int e = 0; e < 12; e++) {
24412437
if (!(edges & (1 << e))) continue;
24422438

2443-
// 1) エッジキーを計算して頂点IDテーブルを予約
24442439
int key = computeEdgeKey(ix, iy, iz, e, vn.x, vn.y, vn.z);
2440+
2441+
// Acquire ownership of this edge.
24452442
int old = atomicCAS(&d_edgeVertexIds[key], -1, 0);
2446-
if (old != -1) continue; // 既に誰かが生成済み
2443+
if (old != -1) continue; // someone else already owns or created it
24472444

2448-
// 2) 新しい頂点ID を確保
24492445
int vid = atomicAdd(d_vtxCounter, 1);
24502446

2451-
if (max_vertices_num < vid) {
2447+
// IMPORTANT: bounds check must be vid >= max_vertices_num (0-based).
2448+
if (vid >= max_vertices_num) {
2449+
// Roll back the lock so future retries won't see a stale 0.
2450+
atomicExch(&d_edgeVertexIds[key], -1);
2451+
atomicExch(d_overflow, 1);
24522452
return;
24532453
}
24542454

2455+
// Publish the vertex id.
24552456
atomicExch(&d_edgeVertexIds[key], vid);
24562457

2457-
// 3) エッジに対応する 2 つのコーナー番号
24582458
int c0_id = edgeCorners[e][0];
24592459
int c1_id = edgeCorners[e][1];
24602460

2461-
// 4) それぞれのコーナーのグリッド座標 (gx,gy,gz) を計算
24622461
int gx0 = ix + cornerOffset[c0_id][0];
24632462
int gy0 = iy + cornerOffset[c0_id][1];
24642463
int gz0 = iz + cornerOffset[c0_id][2];
24652464
int gx1 = ix + cornerOffset[c1_id][0];
24662465
int gy1 = iy + cornerOffset[c1_id][1];
24672466
int gz1 = iz + cornerOffset[c1_id][2];
24682467

2469-
// 5) flat index に戻す(必要なら使わずに直接 sdf 参照も可)
24702468
int flat0 = gz0 * vn.y * vn.x + gy0 * vn.x + gx0;
24712469
int flat1 = gz1 * vn.y * vn.x + gy1 * vn.x + gx1;
24722470

2473-
// 6) 実ワールド座標を計算
2471+
// Optional safety: avoid division by zero if update_num can be 0.
2472+
int u0 = voxels[flat0].update_num;
2473+
int u1 = voxels[flat1].update_num;
2474+
if (u0 <= 0 || u1 <= 0 || weight == 0.0f) {
2475+
// Invalidate this edge entry and signal failure (safe fallback).
2476+
atomicExch(&d_edgeVertexIds[key], -1);
2477+
atomicExch(d_overflow, 1);
2478+
return;
2479+
}
2480+
24742481
float3 p1 = voxel_idx2pos(make_int3(gx0, gy0, gz0), bb_min, resolution);
24752482
float3 p2 = voxel_idx2pos(make_int3(gx1, gy1, gz1), bb_min, resolution);
24762483

2477-
// 7) SDF 値を取得
2478-
float v1 = voxels[flat0].sdf_sum / float(voxels[flat0].update_num * weight);
2479-
float v2 = voxels[flat1].sdf_sum / float(voxels[flat1].update_num * weight);
2484+
float v1 = voxels[flat0].sdf_sum / float(u0 * weight);
2485+
float v2 = voxels[flat1].sdf_sum / float(u1 * weight);
24802486

2481-
// 8) 線形補間で頂点位置を求めて書き込み
24822487
d_vertices[vid] = VertexInterp(p1, p2, v1, v2, iso_level);
24832488
}
24842489
}
@@ -2487,36 +2492,53 @@ __global__ void BuildVerticesKernel(const VoxelCudaNaive* voxels, float3 bb_min,
24872492
__global__ void BuildFacesKernel(const VoxelCudaNaive* voxels, float3 bb_min,
24882493
float3 resolution, int3 vn, float iso_level,
24892494
int* d_edgeVertexIds, int* d_idxCounter,
2490-
int* d_faces, int max_faces) {
2495+
int* d_faces, int max_faces, int* d_overflow) {
24912496
int ix = blockIdx.x * blockDim.x + threadIdx.x;
24922497
int iy = blockIdx.y * blockDim.y + threadIdx.y;
24932498
int iz = blockIdx.z * blockDim.z + threadIdx.z;
24942499
if (ix >= vn.x - 1 || iy >= vn.y - 1 || iz >= vn.z - 1) return;
2500+
24952501
int cubeIndex =
24962502
calcCubeIndex(voxels, ix, iy, iz, vn, bb_min, resolution, iso_level);
24972503
if (cubeIndex < 0) return;
2504+
24982505
int* tri = (int*)(&d_triTable[cubeIndex][0]);
24992506
for (int i = 0; tri[i] != -1; i += 3) {
25002507
int e0 = tri[i], e1 = tri[i + 1], e2 = tri[i + 2];
2508+
25012509
int k0 = computeEdgeKey(ix, iy, iz, e0, vn.x, vn.y, vn.z);
25022510
int k1 = computeEdgeKey(ix, iy, iz, e1, vn.x, vn.y, vn.z);
25032511
int k2 = computeEdgeKey(ix, iy, iz, e2, vn.x, vn.y, vn.z);
2512+
25042513
int v0 = d_edgeVertexIds[k0];
25052514
int v1 = d_edgeVertexIds[k1];
25062515
int v2 = d_edgeVertexIds[k2];
2516+
2517+
// Optional safety: if any edge is missing/locked, fail fast.
2518+
if (v0 < 0 || v1 < 0 || v2 < 0) {
2519+
atomicExch(d_overflow, 1);
2520+
return;
2521+
}
2522+
25072523
int idx = atomicAdd(d_idxCounter, 3);
2508-
if (max_faces < idx) {
2524+
2525+
// IMPORTANT: idx writes idx, idx+1, idx+2 (0-based).
2526+
if (idx + 2 >= max_faces) {
2527+
atomicExch(d_overflow, 1);
25092528
return;
25102529
}
2530+
25112531
d_faces[idx + 0] = v2;
25122532
d_faces[idx + 1] = v1;
25132533
d_faces[idx + 2] = v0;
25142534
}
25152535
}
2536+
25162537
__global__ void BuildFacesKernelWithNormal(
25172538
const VoxelCudaNaive* voxels, float3 bb_min, float3 resolution, int3 vn,
25182539
float iso_level, int* d_edgeVertexIds, int* d_idxCounter, int* d_faces,
2519-
float3* d_vertices, float3* d_face_normals, int max_faces) {
2540+
float3* d_vertices, float3* d_face_normals, int max_faces,
2541+
int* d_overflow) {
25202542
int ix = blockIdx.x * blockDim.x + threadIdx.x;
25212543
int iy = blockIdx.y * blockDim.y + threadIdx.y;
25222544
int iz = blockIdx.z * blockDim.z + threadIdx.z;
@@ -2534,7 +2556,8 @@ __global__ void BuildFacesKernelWithNormal(
25342556
int v1 = d_edgeVertexIds[k1];
25352557
int v2 = d_edgeVertexIds[k2];
25362558
int idx = atomicAdd(d_idxCounter, 3);
2537-
if (max_faces < idx) {
2559+
if (idx + 2 >= max_faces) {
2560+
atomicExch(d_overflow, 1);
25382561
return;
25392562
}
25402563
d_faces[idx + 0] = v2;
@@ -2959,6 +2982,9 @@ class VoxelGridCudaNaive::Impl {
29592982
cudaMemcpyToSymbol(c_voxel_num, &voxel_num_, sizeof(int3));
29602983
cudaMemcpyToSymbol(c_trunc, &option_.truncation_band, sizeof(float));
29612984

2985+
cudaMalloc(&d_overflow_, sizeof(int));
2986+
cudaMemset(d_overflow_, 0, sizeof(int));
2987+
29622988
return true;
29632989
}
29642990

@@ -3331,18 +3357,28 @@ class VoxelGridCudaNaive::Impl {
33313357
constexpr float iso_level = 0.f;
33323358
constexpr int tri_ratio = 2;
33333359
while (true) {
3360+
cudaMemset(d_overflow_, 0, sizeof(int));
3361+
33343362
BuildVerticesKernel<<<grid, block>>>(
33353363
d_voxels_, bb_min_, resolution_, voxel_num_, iso_level,
3336-
d_edgeVertexIds, d_vtxCounter, d_vertices, option_.weight, max_tris_);
3364+
d_edgeVertexIds, d_vtxCounter, d_vertices, option_.weight, max_tris_,
3365+
d_overflow_);
3366+
33373367
checkCudaErrors(cudaGetLastError());
33383368
checkCudaErrors(cudaDeviceSynchronize());
3369+
3370+
int h_overflow = 0;
3371+
cudaMemcpy(&h_overflow, d_overflow_, sizeof(int), cudaMemcpyDeviceToHost);
3372+
33393373
int h_vcount = 0;
33403374
cudaMemcpy(&h_vcount, d_vtxCounter, sizeof(int), cudaMemcpyDeviceToHost);
3341-
if (h_vcount < max_tris_) {
3375+
3376+
if (h_overflow == 0 && h_vcount <= max_tris_) {
33423377
num_vertices_ = h_vcount;
33433378
break;
33443379
}
3345-
// If memory is not enough, reallocate
3380+
3381+
// Retry (buffer insufficient or kernel signaled overflow)
33463382
cudaMemset(d_vtxCounter, 0, sizeof(int));
33473383
cudaMemset(d_edgeVertexIds, -1, sizeof(int) * numEdges);
33483384
EnsureTriangleVertexMemory(h_vcount * tri_ratio);
@@ -3355,22 +3391,28 @@ class VoxelGridCudaNaive::Impl {
33553391
}
33563392

33573393
while (true) {
3394+
3395+
cudaMemset(d_overflow_, 0, sizeof(int));
3396+
33583397
if (with_face_normals) {
33593398
BuildFacesKernelWithNormal<<<grid, block>>>(
33603399
d_voxels_, bb_min_, resolution_, voxel_num_, iso_level,
33613400
d_edgeVertexIds, d_idxCounter, d_faces, d_vertices, d_face_normals,
3362-
max_faces_);
3401+
max_faces_, d_overflow_);
33633402
} else {
33643403
BuildFacesKernel<<<grid, block>>>(
33653404
d_voxels_, bb_min_, resolution_, voxel_num_, iso_level,
3366-
d_edgeVertexIds, d_idxCounter, d_faces, max_faces_);
3405+
d_edgeVertexIds, d_idxCounter, d_faces, max_faces_, d_overflow_);
33673406
}
33683407
checkCudaErrors(cudaGetLastError());
33693408
checkCudaErrors(cudaDeviceSynchronize());
33703409

3410+
int h_overflow = 0;
3411+
cudaMemcpy(&h_overflow, d_overflow_, sizeof(int), cudaMemcpyDeviceToHost);
3412+
33713413
int h_icount = 0;
33723414
cudaMemcpy(&h_icount, d_idxCounter, sizeof(int), cudaMemcpyDeviceToHost);
3373-
if (h_icount < max_faces_) {
3415+
if (h_overflow == 0 && h_icount < max_faces_) {
33743416
num_faces_ = h_icount / 3;
33753417
break;
33763418
}
@@ -3746,6 +3788,11 @@ class VoxelGridCudaNaive::Impl {
37463788
max_tris_ = 0;
37473789

37483790
d_mesh_process_buf_.Free();
3791+
3792+
if (d_overflow_) {
3793+
cudaFree(d_overflow_);
3794+
d_overflow_ = nullptr;
3795+
}
37493796
}
37503797

37513798
void EnsureTriangleVertexMemory(int tris_num) {
@@ -3816,6 +3863,8 @@ class VoxelGridCudaNaive::Impl {
38163863
int num_faces_{0};
38173864
int num_vertices_{0};
38183865

3866+
int* d_overflow_ = nullptr; // device flag: 0 ok, 1 overflow/invalid
3867+
38193868
ugu::RemoveSmallConnectedComponentsBuf d_mesh_process_buf_;
38203869
};
38213870

0 commit comments

Comments
 (0)