Commit 6431900

Add: Ephemeral GPU executors if no device is passed
1 parent 24967c7 commit 6431900

4 files changed: +106 -25 lines changed

c/stringzillas.cuh

Lines changed: 86 additions & 1 deletion
@@ -218,6 +218,28 @@ struct gpu_scope_t {
 };
 szs::cuda_executor_t &get_executor(gpu_scope_t &scope) noexcept { return scope.executor; }
 sz::gpu_specs_t get_specs(gpu_scope_t const &scope) noexcept { return scope.specs; }
+
+/** Cached default GPU context (device 0) to avoid repeated scheduling boilerplate */
+struct default_gpu_context_t {
+    sz::status_t status = sz::status_t::unknown_k;
+    szs::cuda_executor_t executor;
+    sz::gpu_specs_t specs;
+};
+
+inline default_gpu_context_t &default_gpu_context() {
+    static default_gpu_context_t ctx = [] {
+        default_gpu_context_t result;
+        auto specs_status = szs::gpu_specs_fetch(result.specs, 0);
+        if (specs_status.status != sz::status_t::success_k) {
+            result.status = specs_status.status;
+            return result;
+        }
+        auto exec_status = result.executor.try_scheduling(0);
+        result.status = exec_status.status;
+        return result;
+    }();
+    return ctx;
+}
 #endif

 struct device_scope_t {
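
The accessor above leans on two standard C++ idioms: a function-local `static` (initialized exactly once and thread-safely since C++11) and an immediately-invoked lambda, which lets the one-time setup use early returns. A minimal self-contained sketch of the same pattern, with placeholder names instead of the library's types:

    #include <cstdio>

    struct context_t {
        bool ready = false; // did the one-time setup succeed?
        int device = -1;    // which device was probed
    };

    // Function-local static + immediately-invoked lambda: the initializer runs
    // once, on first call, under the compiler-generated thread-safe guard, and
    // every later call returns the cached result, success or failure alike.
    inline context_t &cached_context() {
        static context_t ctx = [] {
            context_t result;
            result.device = 0;   // stand-in for probing device 0
            result.ready = true; // stand-in for a successful scheduling call
            return result;
        }();
        return ctx;
    }

    int main() {
        std::printf("ready=%d device=%d\n", cached_context().ready, cached_context().device);
        return 0;
    }

Note that a failed initialization is also cached: if device 0 cannot be scheduled on the first call, every subsequent default-scope call reports the same stored status instead of retrying.
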
@@ -293,6 +315,16 @@ sz_status_t szs_levenshtein_distances_for_(
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    // Try ephemeral GPU on default scope (device 0)
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            sz::status_t status = engine_variant( //
+                a_container, b_container, results_strided, ctx.executor, ctx.specs);
+            result = static_cast<sz_status_t>(status);
+        }
+    }
     else { result = sz_device_code_mismatch_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
@@ -452,6 +484,15 @@ sz_status_t szs_needleman_wunsch_scores_for_(
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            sz::status_t status = engine_variant( //
+                a_container, b_container, results_strided, ctx.executor, ctx.specs);
+            result = static_cast<sz_status_t>(status);
+        }
+    }
     else { result = sz_status_unknown_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
@@ -540,6 +581,25 @@ sz_status_t szs_smith_waterman_scores_for_(
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        sz::gpu_specs_t specs;
+        auto specs_status = szs::gpu_specs_fetch(specs, 0);
+        if (specs_status.status != sz::status_t::success_k) {
+            result = static_cast<sz_status_t>(specs_status.status);
+        }
+        else {
+            szs::cuda_executor_t executor;
+            auto exec_status = executor.try_scheduling(0);
+            if (exec_status.status != sz::status_t::success_k) {
+                result = static_cast<sz_status_t>(exec_status.status);
+            }
+            else {
+                sz::status_t status = engine_variant( //
+                    a_container, b_container, results_strided, executor, specs);
+                result = static_cast<sz_status_t>(status);
+            }
+        }
+    }
     else { result = sz_status_unknown_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
@@ -659,14 +719,23 @@ sz_status_t szs_fingerprints_for_( //
     auto const min_counts_rows = //
         strided_rows<sz_u32_t> {reinterpret_cast<sz_ptr_t>(min_counts), dims, min_counts_stride, texts_count};

-    // CPU fallback hashers can only work with CPU-compatible device scopes
+    // GPU fallback hashers can work with GPU scope, or default scope via an ephemeral GPU executor
     if (std::holds_alternative<gpu_scope_t>(device->variants)) {
         auto &device_scope = std::get<gpu_scope_t>(device->variants);
         sz::status_t status = fallback_hashers( //
             texts_container, min_hashes_rows, min_counts_rows, //
             get_executor(device_scope), get_specs(device_scope));
         result = static_cast<sz_status_t>(status);
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            sz::status_t status = fallback_hashers( //
+                texts_container, min_hashes_rows, min_counts_rows, ctx.executor, ctx.specs);
+            result = static_cast<sz_status_t>(status);
+        }
+    }
     else { result = sz_status_unknown_k; }
 };
 #endif // SZ_USE_CUDA
@@ -704,6 +773,22 @@ sz_status_t szs_fingerprints_for_( //
             if (result != sz_success_k) break;
         }
     }
+    else if (std::holds_alternative<default_scope_t>(device->variants)) {
+        auto &ctx = default_gpu_context();
+        if (ctx.status != sz::status_t::success_k) { result = static_cast<sz_status_t>(ctx.status); }
+        else {
+            for (std::size_t i = 0; i < unrolled_hashers.size(); ++i) {
+                auto &engine_variant = unrolled_hashers[i];
+                sz::status_t status = engine_variant( //
+                    texts_container, //
+                    min_hashes_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
+                    min_counts_rows.template shifted<fingerprint_slice_k>(i * bytes_per_slice_k), //
+                    ctx.executor, ctx.specs);
+                result = static_cast<sz_status_t>(status);
+                if (result != sz_success_k) break;
+            }
+        }
+    }
     else { result = sz_status_unknown_k; }
 #else
     result = sz_status_unknown_k; // GPU support is not enabled
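
Each of these entry points dispatches on the `std::variant` stored in `device->variants`: an explicit `gpu_scope_t` wins, the new `default_scope_t` branch falls back to the cached device-0 context, and anything else yields an error code. A reduced sketch of that control flow using only standard-library types (the scope names and return values here are illustrative, not the library's):

    #include <cstdio>
    #include <variant>

    struct cpu_scope_t {};                  // engine pinned to CPU threads
    struct gpu_scope_t { int device = 0; }; // caller picked a concrete GPU
    struct default_scope_t {};              // caller passed no device at all

    using scope_t = std::variant<cpu_scope_t, gpu_scope_t, default_scope_t>;

    // Mirrors the if / else-if / else chain in the wrappers above.
    int pick_device(scope_t const &scope) {
        if (std::holds_alternative<gpu_scope_t>(scope))
            return std::get<gpu_scope_t>(scope).device; // use the caller's GPU
        else if (std::holds_alternative<default_scope_t>(scope))
            return 0;  // fall back to an ephemeral executor on device 0
        else
            return -1; // device/engine mismatch
    }

    int main() {
        std::printf("%d %d %d\n", pick_device(gpu_scope_t {1}), pick_device(default_scope_t {}),
                    pick_device(cpu_scope_t {}));
        return 0;
    }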

python/stringzillas.c

Lines changed: 11 additions & 17 deletions
@@ -135,14 +135,8 @@ static inline sz_bool_t try_swap_to_unified_allocator(PyObject *strs_obj) {
  * @brief Helper function to determine if unified memory is required based on capabilities and device scope.
  * @param[in] capabilities The capabilities bitmask of the current engine.
  */
-static inline sz_bool_t requires_unified_memory(sz_capability_t capabilities, szs_device_scope_t device_handle) {
-    // Only relevant if CUDA capability is enabled
-    if ((capabilities & sz_cap_cuda_k) == 0) return sz_false_k;
-
-    // Check that the executor is a GPU device scope
-    sz_size_t gpu_device = 0;
-    if (szs_device_scope_get_gpu_device(device_handle, &gpu_device) == sz_success_k) return sz_true_k;
-    return sz_false_k;
+static inline sz_bool_t requires_unified_memory(sz_capability_t capabilities) {
+    return (capabilities & sz_cap_cuda_k) != 0;
 }

 #pragma endregion
@@ -452,8 +446,8 @@ static PyObject *LevenshteinDistances_call(LevenshteinDistances *self, PyObject
     sz_status_t (*kernel_punned)(szs_levenshtein_distances_t, szs_device_scope_t, void *, void *, sz_size_t *,
                                  sz_size_t) = NULL;

-    // Swap allocators only when using CUDA with a GPU device
-    if (requires_unified_memory(self->capabilities, device_handle))
+    // Swap allocators only when using CUDA with a GPU device (inputs must be unified)
+    if (requires_unified_memory(self->capabilities))
         if (!try_swap_to_unified_allocator(a_obj) || !try_swap_to_unified_allocator(b_obj)) return NULL;

     // Handle 32-bit tape inputs
@@ -748,8 +742,8 @@ static PyObject *LevenshteinDistancesUTF8_call(LevenshteinDistancesUTF8 *self, P
     sz_status_t (*kernel_punned)(szs_levenshtein_distances_t, szs_device_scope_t, void *, void *, sz_size_t *,
                                  sz_size_t) = NULL;

-    // Swap allocators only when using CUDA with a GPU device
-    if (requires_unified_memory(self->capabilities, device_handle))
+    // Swap allocators when engine supports CUDA
+    if (requires_unified_memory(self->capabilities))
         if (!try_swap_to_unified_allocator(a_obj) || !try_swap_to_unified_allocator(b_obj)) return NULL;

     // Handle 32-bit tape inputs
@@ -1079,8 +1073,8 @@ static PyObject *NeedlemanWunsch_call(NeedlemanWunsch *self, PyObject *args, PyO
     sz_status_t (*kernel_punned)(szs_needleman_wunsch_scores_t, szs_device_scope_t, void const *, void const *,
                                  sz_ssize_t *, sz_size_t) = NULL;

-    // Swap allocators only when using CUDA with a GPU device
-    if (requires_unified_memory(self->capabilities, device_handle))
+    // Swap allocators only when using CUDA with a GPU device (inputs must be unified)
+    if (requires_unified_memory(self->capabilities))
         if (!try_swap_to_unified_allocator(a_obj) || !try_swap_to_unified_allocator(b_obj)) return NULL;

     // Handle 32-bit tape inputs
@@ -1393,8 +1387,8 @@ static PyObject *SmithWaterman_call(SmithWaterman *self, PyObject *args, PyObjec
     sz_status_t (*kernel_punned)(szs_smith_waterman_scores_t, szs_device_scope_t, void const *, void const *,
                                  sz_ssize_t *, sz_size_t) = NULL;

-    // Swap allocators only when using CUDA with a GPU device
-    if (requires_unified_memory(self->capabilities, device_handle))
+    // Swap allocators only when using CUDA with a GPU device (inputs must be unified)
+    if (requires_unified_memory(self->capabilities))
         if (!try_swap_to_unified_allocator(a_obj) || !try_swap_to_unified_allocator(b_obj)) return NULL;

     // Handle 32-bit tape inputs
@@ -1744,7 +1738,7 @@ static PyObject *Fingerprints_call(Fingerprints *self, PyObject *args, PyObject
     }

     // Swap allocators only when using CUDA with a GPU device (inputs must be unified)
-    sz_bool_t need_unified = requires_unified_memory(self->capabilities, device_handle);
+    sz_bool_t need_unified = requires_unified_memory(self->capabilities);
     if (need_unified)
         if (!try_swap_to_unified_allocator(texts_obj)) return NULL;

scripts/test_stringzilla.py

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@

     uv pip install numpy pyarrow pytest pytest-repeat
     uv pip install -e . --force-reinstall --no-build-isolation
+    uv run --no-project python -m pytest scripts/test_stringzilla.py -s -x

 Recommended flags for better diagnostics:

scripts/test_stringzillas.py

Lines changed: 8 additions & 7 deletions
@@ -7,6 +7,14 @@
     uv pip install numpy pyarrow pytest pytest-repeat affine-gaps
     SZ_TARGET=stringzillas-cpus uv pip install -e . --force-reinstall --no-build-isolation
     uv run --no-project python -c "import stringzillas; print(stringzillas.__capabilities__)"
+    uv run --no-project python -m pytest scripts/test_stringzillas.py -s -x
+
+To run for the CUDA backend:
+
+    uv pip install numpy pyarrow pytest pytest-repeat affine-gaps
+    SZ_TARGET=stringzillas-cuda uv pip install -e . --force-reinstall --no-build-isolation
+    uv run --no-project python -c "import stringzillas; print(stringzillas.__capabilities__)"
+    uv run --no-project python -m pytest scripts/test_stringzillas.py -s -x

 Recommended flags for better diagnostics:
@@ -20,13 +28,6 @@
 Example:

     uv run --no-project python -X faulthandler -m pytest scripts/test_stringzillas.py -s -vv --maxfail=1 --full-trace
-
-To run for the CUDA backend:
-
-    uv pip install numpy pyarrow pytest pytest-repeat affine-gaps
-    SZ_TARGET=stringzillas-cuda uv pip install -e . --force-reinstall --no-build-isolation
-    uv run --no-project python -c "import stringzillas; print(stringzillas.__capabilities__)"
-    uv run --no-project python -m pytest scripts/test_stringzillas.py -s -x
 """

 from random import choice, randint, seed
