@@ -218,6 +218,28 @@ struct gpu_scope_t {
 };
 szs::cuda_executor_t &get_executor(gpu_scope_t &scope) noexcept { return scope.executor; }
 sz::gpu_specs_t get_specs(gpu_scope_t const &scope) noexcept { return scope.specs; }
+
+/** Cached default GPU context (device 0) to avoid repeated scheduling boilerplate */
+struct default_gpu_context_t {
+    sz::status_t status = sz::status_t::unknown_k;
+    szs::cuda_executor_t executor;
+    sz::gpu_specs_t specs;
+};
+
+inline default_gpu_context_t &default_gpu_context() {
+    static default_gpu_context_t ctx = [] {
+        default_gpu_context_t result;
+        auto specs_status = szs::gpu_specs_fetch(result.specs, 0);
+        if (specs_status.status != sz::status_t::success_k) {
+            result.status = specs_status.status;
+            return result;
+        }
+        auto exec_status = result.executor.try_scheduling(0);
+        result.status = exec_status.status;
+        return result;
+    }();
+    return ctx;
+}
 #endif
 
 struct device_scope_t {
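For context, the new `default_gpu_context()` relies on a function-local static initialized by an immediately-invoked lambda: the device probe and scheduling run once, on first use, and the memoized result (including a failure status) is reused by later calls. A minimal, library-free sketch of that idiom, with hypothetical `cached_context_t` / `status_t` names standing in for the `szs` / `sz` types:

```cpp
#include <cstdio>

// Hypothetical stand-ins for the library's status and context types.
enum class status_t { unknown, failed, success };

struct cached_context_t {
    status_t status = status_t::unknown;
    int device_id = -1;
};

// Function-local static initialized by an immediately-invoked lambda:
// the initializer runs once, on first call, and is thread-safe since C++11.
inline cached_context_t &cached_context() {
    static cached_context_t ctx = [] {
        cached_context_t result;
        result.device_id = 0;            // pretend we probed device 0
        result.status = status_t::success;
        return result;
    }();
    return ctx;
}

int main() {
    auto &ctx = cached_context();   // first call pays the initialization cost
    auto &again = cached_context(); // later calls reuse the same object
    std::printf("same object: %d, ok: %d\n", &ctx == &again, ctx.status == status_t::success);
}
```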
@@ -293,6 +315,16 @@ sz_status_t szs_levenshtein_distances_for_(
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    // Try ephemeral GPU on default scope (device 0)
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            sz::status_t status = engine_variant( //
+                a_container, b_container, results_strided, ctx.executor, ctx.specs);
+            result = static_cast<sz_status_t>(status);
+        }
+    }
     else { result = sz_device_code_mismatch_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
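Each entry point dispatches on the active alternative of `device->variants` with `std::holds_alternative` / `std::get`, and the new `default_scope_t` branch falls back to the cached GPU context. A self-contained sketch of that dispatch shape, using hypothetical scope types rather than the real `device_scope_t` internals:

```cpp
#include <cstdio>
#include <variant>

// Hypothetical stand-ins for the real scope types held in `device->variants`.
struct gpu_scope_t { int device_id; };
struct default_scope_t {};

using scope_variant_t = std::variant<default_scope_t, gpu_scope_t>;

int dispatch(scope_variant_t const &variants) {
    // Explicit GPU scope: use the executor/specs it already carries.
    if (std::holds_alternative<gpu_scope_t>(variants)) {
        auto &scope = std::get<gpu_scope_t>(variants);
        return scope.device_id;
    }
    // Default scope: fall back to a lazily built context for device 0.
    else if (std::holds_alternative<default_scope_t>(variants)) { return 0; }
    // Anything else maps to a device/code mismatch in the real bindings.
    else { return -1; }
}

int main() {
    std::printf("%d %d\n", dispatch(gpu_scope_t {3}), dispatch(default_scope_t {}));
}
```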
@@ -452,6 +484,15 @@ sz_status_t szs_needleman_wunsch_scores_for_(
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            sz::status_t status = engine_variant( //
+                a_container, b_container, results_strided, ctx.executor, ctx.specs);
+            result = static_cast<sz_status_t>(status);
+        }
+    }
     else { result = sz_status_unknown_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
@@ -540,6 +581,25 @@ sz_status_t szs_smith_waterman_scores_for_(
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        sz::gpu_specs_t specs;
+        auto specs_status = szs::gpu_specs_fetch(specs, 0);
+        if (specs_status.status != sz::status_t::success_k) {
+            result = static_cast<sz_status_t>(specs_status.status);
+        }
+        else {
+            szs::cuda_executor_t executor;
+            auto exec_status = executor.try_scheduling(0);
+            if (exec_status.status != sz::status_t::success_k) {
+                result = static_cast<sz_status_t>(exec_status.status);
+            }
+            else {
+                sz::status_t status = engine_variant( //
+                    a_container, b_container, results_strided, executor, specs);
+                result = static_cast<sz_status_t>(status);
+            }
+        }
+    }
     else { result = sz_status_unknown_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
@@ -659,14 +719,23 @@ sz_status_t szs_fingerprints_for_( //
     auto const min_counts_rows = //
         strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_counts), dims, min_counts_stride, texts_count};
 
-    // CPU fallback hashers can only work with CPU-compatible device scopes
+    // GPU fallback hashers can work with a GPU scope, or with the default scope via an ephemeral GPU executor
     if (std::holds_alternative<gpu_scope_t>(device->variants)) {
         auto &device_scope = std::get<gpu_scope_t>(device->variants);
         sz::status_t status = fallback_hashers( //
             texts_container, min_hashes_rows, min_counts_rows, //
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            sz::status_t status = fallback_hashers( //
+                texts_container, min_hashes_rows, min_counts_rows, ctx.executor, ctx.specs);
+            result = static_cast<sz_status_t>(status);
+        }
+    }
     else { result = sz_status_unknown_k; }
     };
 #endif // SZ_USE_CUDA
@@ -704,6 +773,22 @@ sz_status_t szs_fingerprints_for_( //
             if (result != sz_success_k) break;
         }
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
+                auto &engine_variant = unrolled_hashers[i];
+                sz::status_t status = engine_variant( //
+                    texts_container, //
+                    min_hashes_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
+                    min_counts_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
+                    ctx.executor, ctx.specs);
+                result = static_cast<sz_status_t>(status);
+                if (result != sz_success_k) break;
+            }
+        }
+    }
     else { result = sz_status_unknown_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
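The unrolled fingerprinting loop above propagates the first failing slice's status and stops early. A minimal sketch of that early-exit pattern, with a hypothetical `sz_failure_k` code standing in for any non-success status:

```cpp
#include <array>
#include <cstddef>
#include <cstdio>
#include <functional>

// Hypothetical status codes mirroring sz_status_t's success/failure split.
enum sz_status_t { sz_success_k = 0, sz_failure_k = 1 };

int main() {
    // Stand-ins for the unrolled hasher slices: each returns a status.
    std::array<std::function<sz_status_t()>, 3> unrolled_hashers = {
        [] { return sz_success_k; },
        [] { return sz_failure_k; }, // second slice fails ...
        [] { return sz_success_k; }, // ... so this one never runs
    };

    sz_status_t result = sz_success_k;
    for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
        result = unrolled_hashers[i]();
        if (result != sz_success_k) break; // stop at the first failing slice
    }
    std::printf("final status: %d\n", result);
}
```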