@@ -116,10 +116,15 @@ bool CAssetConverter::acceleration_structure_patch_base::valid(const ILogicalDev
		if (allowDataAccess && !limits.rayTracingPositionFetch)
			return false;
		// can always build with the device
+		if (hostBuild)
		#ifdef NBL_ACCELERATION_STRUCTURE_CONVERSION_HOST_READY
-		if (hostBuild && !features.accelerationStructureHostCommands)
+		if (!features.accelerationStructureHostCommands)
		#endif
+		{
+			if (auto logger=device->getLogger(); logger)
+				logger->log("Host Acceleration Structure Builds are not yet supported!",system::ILogger::ELL_ERROR);
			hostBuild = false;
+		}
		return true;
	}

CAssetConverter::patch_impl_t<ICPUBottomLevelAccelerationStructure>::patch_impl_t(const ICPUBottomLevelAccelerationStructure* blas)
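A note on the hunk above: with the guard macro left undefined, the added `if (hostBuild)` block demotes every requested host build to a device build and logs the error; once `NBL_ACCELERATION_STRUCTURE_CONVERSION_HOST_READY` is defined, the demotion only happens when `features.accelerationStructureHostCommands` is missing. A sketch of the two preprocessed forms, derived from the diff itself and not part of the commit:

```cpp
// With NBL_ACCELERATION_STRUCTURE_CONVERSION_HOST_READY defined:
// host builds are only demoted when the device lacks the feature.
if (hostBuild)
if (!features.accelerationStructureHostCommands)
{
	if (auto logger=device->getLogger(); logger)
		logger->log("Host Acceleration Structure Builds are not yet supported!",system::ILogger::ELL_ERROR);
	hostBuild = false;
}

// Without the macro (current default): every requested host build is
// demoted to a device build and the error is logged unconditionally.
if (hostBuild)
{
	if (auto logger=device->getLogger(); logger)
		logger->log("Host Acceleration Structure Builds are not yet supported!",system::ILogger::ELL_ERROR);
	hostBuild = false;
}
```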
@@ -4404,5 +4409,185 @@ ISemaphore::future_t<IQueue::RESULT> CAssetConverter::convert_impl(SReserveResul
	return retval;
}

+#if 0
+	// Lots of extra work, which is why we didn't want to pursue it:
+	// - TLAS builds should happen semi-concurrently to BLAS builds, but need to know which TLAS needs which BLAS to finish (scheduling)
+	//   + also device TLAS builds should know which host-built BLAS they depend on, so that `pool.work()` is called until the BLAS's associated deferred op signals COMPLETE
+	// - any AS should enqueue in a weird way with a sort of RLE: we allocate scratch until we can't, then build whatever we can
+	// - the list of outstanding BLAS and TLAS to build should get updated periodically
+	// - overflow callbacks should call back into the BLAS and TLAS enqueuers and `pool.work()`
+	struct ASBuilderPool
+	{
+		public:
+			struct Worker
+			{
+				public:
+					inline Worker(const ASBuilderPool* _pool) : pool(_pool), pushCount(0), executor([this](){execute();}) {}
+					// note: the executor also needs to get woken (e.g. via `pushCount.notify_one()`) once `pool->stop` is set, otherwise this join can hang
+					inline ~Worker() {executor.join();}
+
+					inline void push(smart_refctd_ptr<IDeferredOperation>&& task)
+					{
+						std::lock_guard<std::mutex> lock(queueLock);
+						tasks.push_back(std::move(task));
+						pushCount.fetch_add(1);
+						pushCount.notify_one();
+					}
+				private:
+					inline void execute()
+					{
+						uint64_t oldTaskCount = 0;
+						uint32_t taskIx = 0;
+						while (!pool->stop.test())
+						{
+							// sleep until new tasks get pushed
+							if (pushCount.load()==oldTaskCount)
+								pushCount.wait(oldTaskCount);
+							oldTaskCount = pushCount.load();
+							size_t taskCount;
+							IDeferredOperation* task;
+							// grab the task under a lock so we're not in danger of the vector reallocating
+							{
+								std::lock_guard<std::mutex> lock(queueLock);
+								taskCount = tasks.size();
+								// guard against spurious wakeups on an empty queue
+								if (taskIx>=taskCount)
+								{
+									taskIx = 0;
+									continue;
+								}
+								task = tasks[taskIx].get();
+							}
+							switch (task->execute())
+							{
+								case IDeferredOperation::STATUS::THREAD_IDLE:
+									taskIx++; // next task
+									break;
+								default:
+								{
+									// completed or failed, drop the task
+									std::lock_guard<std::mutex> lock(queueLock);
+									tasks.erase(tasks.begin()+taskIx);
+									break;
+								}
+							}
+							if (taskIx>=taskCount)
+								taskIx = 0;
+						}
+					}
+
+					std::mutex queueLock;
+					const ASBuilderPool* pool;
+					std::atomic_uint64_t pushCount;
+					// declared before `executor` so the queue exists before the thread starts
+					core::vector<smart_refctd_ptr<IDeferredOperation>> tasks;
+					std::thread executor;
+			};
+
+			inline ASBuilderPool(const uint16_t _workerCount, system::logger_opt_ptr _logger) : stop(), workerCount(_workerCount), nextWorkerPush(0), logger(_logger)
+			{
+				// `Worker` is neither default constructible nor movable (mutex, atomic, thread), so construct each one in place
+				workers.reserve(workerCount);
+				for (uint16_t i=0; i<workerCount; i++)
+					workers.emplace_back(std::make_unique<Worker>(this));
+			}
+			inline ~ASBuilderPool()
+			{
+				finish();
+			}
+
+			inline void finish()
+			{
+				while (work()) {}
+				stop.test_and_set();
+				stop.notify_one();
+				workers.clear();
+			}
+
+			struct Build
+			{
+				smart_refctd_ptr<IDeferredOperation> op;
+				// WRONG: for every deferred op, there are multiple `gpuObj` and `hash` that get built by it
+				IGPUAccelerationStructure* gpuObj;
+				core::blake3_hash_t* hash;
+			};
+			inline void push(Build&& build)
+			{
+				auto op = build.op.get();
+				if (!op->isPending())
+				{
+					logger.log("Host Acceleration Structure failed for \"%s\"",system::ILogger::ELL_ERROR,build.gpuObj->getObjectDebugName());
+					// change the content hash on the reverse map to a NoContentHash
+					*build.hash = CHashCache::NoContentHash;
+					return;
+				}
+				// there's no true best way to pick the worker with the least work
+				for (uint16_t i=0; i<min<uint16_t>(op->getMaxConcurrency()-1,workerCount); i++)
+					workers[(nextWorkerPush++)%workerCount]->push(smart_refctd_ptr<IDeferredOperation>(op));
+				buildsInProgress.push_back(std::move(build));
+			}
+
+			inline bool empty() const {return buildsInProgress.empty();}
+
+			// The idea is to somehow get the overflow callbacks to call this
+			inline bool work()
+			{
+				if (empty())
+					return false;
+				auto build = buildsInProgress.begin()+buildIx;
+				switch (build->op->execute())
+				{
+					case IDeferredOperation::STATUS::THREAD_IDLE:
+						buildIx++; // next task
+						break;
+					case IDeferredOperation::STATUS::_ERROR:
+						logger.log("Host Acceleration Structure failed for \"%s\"",system::ILogger::ELL_ERROR,build->gpuObj->getObjectDebugName());
+						// change the content hash on the reverse map to a NoContentHash
+						*build->hash = CHashCache::NoContentHash;
+						[[fallthrough]];
+					default:
+					{
+						buildsInProgress.erase(build);
+						break;
+					}
+				}
+				if (buildIx>=buildsInProgress.size())
+					buildIx = 0;
+				// report whether there are still outstanding builds
+				return !buildsInProgress.empty();
+			}
+
+			std::atomic_flag stop;
+
+		private:
+			uint16_t workerCount;
+			uint16_t nextWorkerPush = 0;
+			system::logger_opt_ptr logger;
+			core::vector<std::unique_ptr<Worker>> workers;
+			core::vector<Build> buildsInProgress;
+			uint32_t buildIx = 0;
+	};
+	ASBuilderPool hostBuilders(params.extraHostASBuildThreads,logger);
+
+	// crappy pseudocode
+	auto hostBLASConvIt = reservations.m_blasConversions[1].begin();
+	auto hostBLASConvEnd = reservations.m_blasConversions[1].end();
+	while (hostBLASConvIt!=hostBLASConvEnd)
+	{
+		auto op = device->createDeferredOperation();
+		if (!op)
+			error, mark failure in staging;
+		core::vector<IGPUBottomLevelAccelerationStructure::HostBuildInfo> infos;
+		core::vector<IGPUBottomLevelAccelerationStructure::BuildRangeInfo> ranges;
+		for (; hostBLASConvIt!=hostBLASConvEnd; hostBLASConvIt++)
+		{
+			// pseudocode: `scratch` stands for a host scratch allocation of `scratchSize` bytes
+			void* scratch = hostBLASConvIt->scratchSize;
+			if (!scratch)
+			{
+				if (infos.empty() && hostBuilders.empty())
+					error, mark failure in staging, can't even enqueue 1 build;
+				else
+					break;
+			}
+
+			auto asset = hostBLASConvIt->canonical;
+			// the real range should be filled from the canonical asset's per-geometry primitive counts
+			asset->getGeometryPrimitiveCounts();
+			ranges.push_back({
+				.primitiveCount = 0,
+				.primitiveByteOffset = 0,
+				.firstVertex = 0,
+				.transformByteOffset = 0
+			});
+		}
+		if (!device->buildAccelerationStructures(op.get(),infos,ranges.data()))
+			continue;
+	}
+#endif
}
}
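The `#if 0` block above outlines an `ASBuilderPool` that was deliberately not pursued; the mechanism it multiplexes across worker threads is simply polling an `IDeferredOperation` until the host build finishes. A minimal single-threaded sketch of that mechanism, assuming `infos` and `ranges` were filled as in the pseudocode loop above and using only calls that already appear in the block (this is not the converter's actual implementation):

```cpp
// Drive one batch of host BLAS builds to completion on the calling thread.
auto op = device->createDeferredOperation();
if (op && device->buildAccelerationStructures(op.get(),infos,ranges.data()))
{
	// keep polling while the deferred operation is still pending
	while (op->isPending())
	{
		const auto status = op->execute();
		// any status other than THREAD_IDLE (completion, _ERROR, or this thread being done) ends polling
		if (status!=IDeferredOperation::STATUS::THREAD_IDLE)
			break;
	}
	// on failure the content hash of the affected GPU object would get reset to
	// CHashCache::NoContentHash, exactly as the pool's `work()` does above
}
```

The pool generalizes this by handing the same `op` to up to `getMaxConcurrency()-1` workers and by interleaving `work()` calls from overflow callbacks, which is the scheduling burden the leading comment cites as the reason the approach was shelved.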