Features:
- PyTorch CUDAExtension for reliable NVCC compilation
- Automatic fallback to CPU if CUDA/ROCm unavailable
- Smart Architecture Detection: compiles only for the active GPU to save RAM/time
- MAX_JOBS control to prevent OOM
"""
1011
1112import os
# VERSION
# ============================================================================

VERSION = "4.2.6"

# ============================================================================
# PRE-FLIGHT CHECKS
# ============================================================================

# Default to a serial build (MAX_JOBS=1) to prevent out-of-memory failures on
# small machines (e.g. Colab free tier). An explicit MAX_JOBS in the caller's
# environment always takes precedence.
os.environ["MAX_JOBS"] = os.environ.get("MAX_JOBS", "1")
2930
def log(msg: str, level: str = "INFO") -> None:
    """Print one build-log line, flushed immediately for live CI/pip output.

    Args:
        msg: Message text to print.
        level: Severity tag. Previously accepted but ignored; non-INFO levels
            are now surfaced in the prefix so warnings/errors stand out.
    """
    # INFO keeps the historical "[CRAYON-BUILD] msg" format unchanged.
    tag = "[CRAYON-BUILD]" if level == "INFO" else f"[CRAYON-BUILD][{level}]"
    # flush=True: setup.py output is often captured/buffered by pip and CI.
    print(f"{tag} {msg}", flush=True)
@@ -49,6 +50,45 @@ def log(msg: str, level: str = "INFO") -> None:
# A ROCm toolchain counts as present only when the hipcc compiler binary
# exists under ROCM_HOME (located during the pre-flight checks above).
HAS_ROCM = os.path.exists(os.path.join(ROCM_HOME, "bin", "hipcc"))
5051
5152
# ============================================================================
# ARCHITECTURE SELECTION
# ============================================================================

def get_cuda_arch_flags():
    """Return the NVCC flags selecting which CUDA architectures to build.

    Behavior:
      * CRAYON_GENERIC_BUILD=1 -> compile for all common architectures
        (used when producing a redistributable PyPI wheel).
      * Otherwise -> compile ONLY for the locally detected GPU, which is
        faster and uses far less RAM than a multi-arch build.
      * If detection fails (or no GPU is visible), fall back to sm_75 (T4),
        a safe default for Colab.

    Returns:
        list[str]: base optimization flags plus one or more ``-gencode`` flags.
    """
    base_flags = ["-O3", "-std=c++17", "--expt-relaxed-constexpr"]

    # Generic multi-arch build for distribution (wheel).
    if os.environ.get("CRAYON_GENERIC_BUILD", "0") == "1":
        log("Building for ALL common CUDA architectures (Generic Wheel)")
        return base_flags + [
            "-gencode=arch=compute_70,code=sm_70",  # V100
            "-gencode=arch=compute_75,code=sm_75",  # T4
            "-gencode=arch=compute_80,code=sm_80",  # A100
            "-gencode=arch=compute_86,code=sm_86",  # RTX 3090
            "-gencode=arch=compute_90,code=sm_90",  # H100
        ]

    # Local build (Colab / user machine): target only the active GPU.
    if TORCH_CUDA_AVAILABLE:
        try:
            major, minor = torch.cuda.get_device_capability()
            # e.g. (7, 5) -> "75"; no separators, or the -gencode flag breaks.
            arch = f"{major}{minor}"
            log(f"Detected GPU: SM {major}.{minor} -> Compiling for sm_{arch} ONLY")
            return base_flags + [f"-gencode=arch=compute_{arch},code=sm_{arch}"]
        except Exception as e:
            log(f"Error detecting GPU capability: {e}. Falling back to common archs.")

    # Fallback when detection fails or no GPU is present (but CUDA_HOME exists).
    return base_flags + [
        "-gencode=arch=compute_75,code=sm_75",  # T4 (safe default for Colab)
    ]
5292# ============================================================================
5393# EXTENSION CONFIGURATION
5494# ============================================================================
@@ -73,24 +113,18 @@ def log(msg: str, level: str = "INFO") -> None:
73113
# --- 2. CUDA Extension (via PyTorch) ---
if TORCH_CUDA_AVAILABLE and not FORCE_CPU and CUDAExtension:
    # Pick -gencode flags for the detected GPU (or all archs for wheels).
    nvcc_flags = get_cuda_arch_flags()
    log(f"Configuring CUDA extension (max_jobs={os.environ['MAX_JOBS']})")

    ext_modules.append(CUDAExtension(
        name="crayon.c_ext.crayon_cuda",
        sources=["src/crayon/c_ext/gpu_engine_cuda.cu"],
        extra_compile_args={
            # Host (C++) compiler flags.
            "cxx": ["-O3", "-std=c++17"],
            # Device (NVCC) flags, including the selected -gencode set.
            "nvcc": nvcc_flags,
        },
    ))

elif not FORCE_CPU and CUDAExtension:
    log("Skipping CUDA extension (PyTorch CUDA not found or CUDA_HOME missing)")
0 commit comments