V4.2.9: Fix Double Free and switch to cudaMalloc (non-async) for T4 stability

Electroiscoding · Electroiscoding · commit 24500f16c77d · 2026-01-28T23:39:46.000+05:30
diff --git a/Crayon_Colab_Notebook.py b/Crayon_Colab_Notebook.py
@@ -1,5 +1,5 @@
 """
-XERV CRAYON V4.2.8 - Production Omni-Backend Tokenizer
+XERV CRAYON V4.2.9 - Production Omni-Backend Tokenizer
 =======================================================
 Copy this ENTIRE script into a Google Colab cell and run it.
 
@@ -13,7 +13,7 @@
 import time
 
 print("=" * 70)
-print("XERV CRAYON V4.2.8 INSTALLATION")
+print("XERV CRAYON V4.2.9 INSTALLATION")
 print("=" * 70)
 
 # ... (rest of the script is same until Verification)
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "xerv-crayon"
-version = "4.2.8"
+version = "4.2.9"
 description = "Omni-Backend Tokenizer - CPU (AVX2/512), CUDA (NVIDIA), ROCm (AMD) with automatic hardware detection"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/src/crayon/__init__.py b/src/crayon/__init__.py
@@ -45,7 +45,7 @@
 
 from __future__ import annotations
 
-__version__ = "4.2.8"
+__version__ = "4.2.9"
 __author__ = "Xerv Research Engineering Division"
 
 # ============================================================================
diff --git a/src/crayon/c_ext/gpu_engine_cuda.cu b/src/crayon/c_ext/gpu_engine_cuda.cu
@@ -157,20 +157,19 @@ static PyObject* load_gpu(PyObject* self, PyObject* args) {
     char* arr_ptr = raw + 12;
     size_t bytes = size * sizeof(int32_t);
 
-    // FIX: Free old + guard
+    // FIX: Free old + guard (RAII handles actual free of old_ptrs upon exit)
     void* old_ptrs[3] = {d_cuda_base, d_cuda_check, d_cuda_values};
     CudaMemGuard guard(old_ptrs, 3);
-    if (cuda_loaded) {
-        CHECK_CUDA_ERR(cudaFree(d_cuda_base));
-        CHECK_CUDA_ERR(cudaFree(d_cuda_check));
-        CHECK_CUDA_ERR(cudaFree(d_cuda_values));
-    }
-
+    
+    // Do not cudaFree the old buffers here: `guard` already owns old_ptrs, so a manual free would double-free them.
+    
     // FIX: Async alloc + stream init
     if (!stream) CHECK_CUDA_ERR(cudaStreamCreate(&stream));
-    CHECK_CUDA_ERR(cudaMallocAsync(&d_cuda_base, bytes, stream));
-    CHECK_CUDA_ERR(cudaMallocAsync(&d_cuda_check, bytes, stream));
-    CHECK_CUDA_ERR(cudaMallocAsync(&d_cuda_values, bytes, stream));
+    
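+    // Use synchronous cudaMalloc instead of cudaMallocAsync: the stream-ordered allocator needs CUDA 11.2+ and memory-pool support, so plain cudaMalloc is the most compatible choice.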
+    CHECK_CUDA_ERR(cudaMalloc(&d_cuda_base, bytes));
+    CHECK_CUDA_ERR(cudaMalloc(&d_cuda_check, bytes));
+    CHECK_CUDA_ERR(cudaMalloc(&d_cuda_values, bytes));
 
     CHECK_CUDA_ERR(cudaMemcpyAsync(d_cuda_base, arr_ptr, bytes, cudaMemcpyHostToDevice, stream));
     CHECK_CUDA_ERR(cudaMemcpyAsync(d_cuda_check, arr_ptr + bytes, bytes, cudaMemcpyHostToDevice, stream));