Merge pull request #1 from Comfy-Org/refactor-1

Kosinkadink · web-flow · commit 7391815a0e28 · 2025-12-22T14:51:49.000-08:00
Refactors to quantization code, fixes to README + LICENSE
diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml
@@ -4,9 +4,15 @@ on:
   push:
     branches:
       - main
+    tags:
+      - "v*"
   pull_request:
   workflow_dispatch:
 
+permissions:
+  # Workflow-wide default is read-only; the OIDC id-token is granted per job instead.
+  contents: read
+
 jobs:
   build_wheels:
     name: Build wheel for ${{ matrix.os }}
@@ -253,3 +259,27 @@ jobs:
           # Tests requiring CUDA will be skipped on CPU-only runners
           python -m pytest tests/ -v --tb=short
 
+  publish:
+    name: Publish to PyPI
+    needs: [build_wheels, test]
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
+    environment: pypi
+    permissions:
+      # The upload step is given no API token, so it relies on OIDC; only this job may request it.
+      id-token: write
+
+    steps:
+      - name: Download all wheel artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: wheels-*
+          path: dist/
+          merge-multiple: true
+
+      - name: List wheels to publish
+        run: ls -la dist/
+
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+
diff --git a/LICENSE b/LICENSE
@@ -1,3 +1,4 @@
+
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
@@ -162,7 +163,7 @@
       other commercial damages or losses), even if such Contributor
       has been advised of the possibility of such damages.
 
-   9. Accepting Warranty or Additional Liability. While redistributing
+   9. Accepting Warranty or Additional Liability. While redistributing
       the Work or Derivative Works thereof, You may choose to offer,
       and charge a fee for, acceptance of support, warranty, indemnity,
       or other liability obligations and/or rights consistent with this
@@ -186,7 +187,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright (c) 2025 Comfy Org. All rights reserved.
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
diff --git a/README.md b/README.md
@@ -68,18 +68,29 @@ pip install -e ".[dev]"
 # Skip build isolation for faster rebuilds
 pip install -e . --no-build-isolation -v
 
-# Install without CUDA backend
-pip install . --no-cuda
 ```
 
 #### Available Build Options
 
-| Option | Description | Default |
-|--------|-------------|---------|
-| `--no-cuda` | Build without CUDA backend | Enabled (build with CUDA) |
-| `--cuda-archs=...` | CUDA architectures to build for | Windows: `80;89;120f`<br>Linux: `80;89;90a;100a;120f` |
-| `--debug-build` | Build in debug mode with symbols | Disabled (Release) |
-| `--lineinfo` | Enable NVCC line information for profiling | Disabled |
+These options require using `setup.py` directly (not `pip install`):
+
+| Option | Command | Description | Default |
+|--------|---------|-------------|---------|
+| `--no-cuda` | `python setup.py bdist_wheel --no-cuda` | Build without CUDA backend | Enabled (build with CUDA) |
+| `--cuda-archs=...` | `python setup.py build_ext --cuda-archs="80;89"` | CUDA architectures to build for | Windows: `80;89;120f`<br>Linux: `80;89;90a;100f;120f` |
+| `--debug-build` | `python setup.py build_ext --debug-build` | Build in debug mode with symbols | Disabled (Release) |
+| `--lineinfo` | `python setup.py build_ext --lineinfo` | Enable NVCC line info for profiling | Disabled |
+
+```bash
+# Build without CUDA
+python setup.py bdist_wheel --no-cuda
+
+# Build with custom CUDA architectures
+python setup.py build_ext --cuda-archs="80;89" bdist_wheel
+
+# Debug build with line info for profiling
+python setup.py build_ext --debug-build --lineinfo bdist_wheel
+```
 
 
 
diff --git a/comfy_kitchen/tensor/base.py b/comfy_kitchen/tensor/base.py
@@ -1,6 +1,7 @@
 """Base classes for quantized tensors with typed layout parameters."""
 from __future__ import annotations
 
+import contextlib
 import dataclasses
 import logging
 from abc import ABC, abstractmethod
@@ -90,6 +91,11 @@ def dequantize(cls, qdata: torch.Tensor, params: Any) -> torch.Tensor:
     def get_plain_tensors(cls, qtensor: QuantizedTensor) -> tuple[torch.Tensor, ...]:
         raise NotImplementedError
 
+    @classmethod
+    @abstractmethod
+    def state_dict_tensors(cls, qdata: torch.Tensor, params: Any) -> dict[str, torch.Tensor]:
+        raise NotImplementedError  # abstract: each layout returns its own {key suffix: tensor} mapping
+
     @classmethod
     def supports_fast_matmul(cls) -> bool:
         """Check if fast quantized matmul is supported on current hardware."""
@@ -263,6 +269,10 @@ def dequantize(self) -> torch.Tensor:
             return full[slices]
         return full
 
+    def state_dict(self, prefix: str = "") -> dict[str, torch.Tensor]:
+        tensors = self._layout_cls.state_dict_tensors(self._qdata, self._params)  # the layout class picks the tensors and their key suffixes
+        return {f"{prefix}{suffix}": tensor for suffix, tensor in tensors.items()}  # final key = prefix + suffix
+
     # ==================== Flatten/Unflatten Protocol ====================
 
     def __tensor_flatten__(self):
@@ -344,6 +354,23 @@ def dequantize_args(args):
 
 # ==================== Dispatch Handlers ====================
 
+def _parse_to_args(args, kwargs):
+    """Return the (device, dtype) requested by .to()-style arguments; either may be None."""
+    device = kwargs.get("device")
+    dtype = kwargs.get("dtype")
+    for arg in args[1:]:  # args[0] is skipped; only the remaining positionals are inspected
+        if isinstance(arg, torch.device):
+            device = arg
+        elif isinstance(arg, torch.dtype):
+            dtype = arg
+        elif isinstance(arg, str):
+            with contextlib.suppress(RuntimeError):  # torch.device() raises RuntimeError for a non-device string; ignore only that
+                device = torch.device(arg)
+    if isinstance(device, str):  # only a keyword device can still be a plain string at this point
+        device = torch.device(device)
+    return device, dtype
+
+
 def _handle_detach(qt, args, kwargs):
     return qt._copy_with(qdata=qt._qdata.detach())
 
@@ -352,29 +379,32 @@ def _handle_clone(qt, args, kwargs):
     return qt._copy_with(qdata=qt._qdata.clone())
 
 
-def _handle_to(qt, args, kwargs):
-    """Unified handler for device/dtype changes."""
-    target_device = kwargs.get("device")
-    target_dtype = kwargs.get("dtype")
-
-    if isinstance(target_device, str):
-        target_device = torch.device(target_device)
+def _handle_to(qt, args, kwargs, force_copy=False):  # force_copy=True disables the "return qt unchanged" shortcut
+    target_device, target_dtype = _parse_to_args(args, kwargs)  # accepts positional as well as keyword device/dtype
 
     needs_device = target_device is not None and target_device != qt._qdata.device
     needs_dtype = target_dtype is not None and target_dtype != qt._params.orig_dtype
 
-    if not needs_device and not needs_dtype:
+    if not needs_device and not needs_dtype and not force_copy:
         return qt
 
-    new_qdata = qt._qdata.to(device=target_device) if needs_device else qt._qdata
-    new_params = qt._params.clone()
     if needs_device:
-        new_params = new_params.to_device(target_device)
+        new_qdata = qt._qdata.to(device=target_device)
+        new_params = qt._params.to_device(target_device)  # NOTE(review): assumes to_device() returns a new object, since orig_dtype may be overwritten below — confirm
+    else:
+        new_qdata = qt._qdata.clone() if force_copy else qt._qdata  # a dtype-only change shares qdata unless a copy is forced
+        new_params = qt._params.clone()
+
     if needs_dtype:
         new_params.orig_dtype = target_dtype
+
     return qt._copy_with(qdata=new_qdata, params=new_params, clone_params=False)
 
 
+def _handle_to_copy(qt, args, kwargs):
+    return _handle_to(qt, args, kwargs, force_copy=True)  # never hands back qt itself, even when device/dtype already match
+
+
 def _handle_contiguous(qt, args, kwargs):
     if qt._qdata.is_contiguous():
         return qt
@@ -386,53 +416,46 @@ def _handle_is_contiguous(qt, args, kwargs):
 
 
 def _handle_copy_(qt, args, kwargs):
-    """Handle in-place copy between QuantizedTensors.
-
-    Raises:
-        TypeError: If src is not a QuantizedTensor or layouts don't match.
-    """
     dst, src = args[0], args[1]
-    non_blocking = kwargs.get("non_blocking", False)
-    if len(args) >= 3:
-        non_blocking = True
     if not isinstance(src, QuantizedTensor):
-        raise TypeError(
-            f"Cannot copy {type(src).__name__} to QuantizedTensor. "
-            "Use QuantizedTensor.from_float() to create a new quantized tensor."
-        )
+        raise TypeError(f"Cannot copy {type(src).__name__} to QuantizedTensor")
     if dst._layout_cls != src._layout_cls:
-        raise TypeError(
-            f"Cannot copy between different layouts: "
-            f"{dst._layout_cls.__name__} vs {src._layout_cls.__name__}"
-        )
+        raise TypeError(f"Layout mismatch: {dst._layout_cls.__name__} vs {src._layout_cls.__name__}")
+
+    dst_orig_dtype = dst._params.orig_dtype  # saved so the params copy below cannot change dst's orig_dtype
+    non_blocking = bool(args[2]) if len(args) >= 3 else kwargs.get("non_blocking", False)  # use the positional flag's value, not merely its presence
+
     dst._qdata.copy_(src._qdata, non_blocking=non_blocking)
     dst._params.copy_from(src._params, non_blocking=non_blocking)
+    dst._params.orig_dtype = dst_orig_dtype  # restore dst's own orig_dtype after copy_from
     return dst
 
 
 def _handle_empty_like(qt, args, kwargs):
-    new_qdata = torch.empty_like(qt._qdata, **kwargs)
+    target_dtype = kwargs.get("dtype")  # read-only lookup: the caller's kwargs dict is left untouched
+    target_device = kwargs.get("device")  # NOTE(review): other kwargs (e.g. memory_format, pin_memory) are not forwarded — confirm intended
+
+    new_qdata = torch.empty_like(qt._qdata, device=target_device)  # qdata keeps its own dtype; a requested dtype only updates orig_dtype
     new_params = qt._params.clone()
-    if "device" in kwargs:
-        new_params = new_params.to_device(kwargs["device"])
-    return qt._copy_with(qdata=new_qdata, params=new_params, clone_params=False)
 
+    if target_device is not None:
+        new_params = new_params.to_device(target_device)
+    if target_dtype is not None:
+        new_params.orig_dtype = target_dtype
 
-def _handle_has_compatible_shallow_copy_type(qt, args, kwargs):
-    """QuantizedTensors support shallow copy compatibility."""
-    return True
+    return qt._copy_with(qdata=new_qdata, params=new_params, clone_params=False)
 
 
 _DISPATCH_TABLE = {
     torch.ops.aten.detach.default: _handle_detach,
     torch.ops.aten.clone.default: _handle_clone,
-    torch.ops.aten._to_copy.default: _handle_to,
+    torch.ops.aten._to_copy.default: _handle_to_copy,  # wrapper that calls _handle_to with force_copy=True
     torch.ops.aten.to.dtype_layout: _handle_to,
     torch.ops.aten.contiguous.default: _handle_contiguous,
     torch.ops.aten.is_contiguous.default: _handle_is_contiguous,
     torch.ops.aten.copy_.default: _handle_copy_,
     torch.ops.aten.empty_like.default: _handle_empty_like,
-    torch.ops.aten._has_compatible_shallow_copy_type.default: _handle_has_compatible_shallow_copy_type,
+    torch.ops.aten._has_compatible_shallow_copy_type.default: lambda qt, args, kwargs: True,  # always reports compatible
 }
 
 # Layout-specific dispatch table: {torch_op: {layout_cls: handler}}
diff --git a/comfy_kitchen/tensor/fp8.py b/comfy_kitchen/tensor/fp8.py
@@ -44,13 +44,14 @@ class Params(BaseLayoutParams):
     def quantize(
         cls,
         tensor: torch.Tensor,
-        scale: torch.Tensor | float | None = None,
+        scale: torch.Tensor | float | str | None = None,
         dtype: torch.dtype = torch.float8_e4m3fn,
+        **kwargs,
     ) -> tuple[torch.Tensor, Params]:
         orig_dtype = tensor.dtype
         orig_shape = tuple(tensor.shape)
 
-        if scale is None:
+        if scale is None or scale == "recalculate":
             scale = torch.amax(tensor.abs()) / torch.finfo(dtype).max
 
         if not isinstance(scale, torch.Tensor):
@@ -69,6 +70,14 @@ def dequantize(cls, qdata: torch.Tensor, params: Params) -> torch.Tensor:
     def get_plain_tensors(cls, qtensor: QuantizedTensor) -> tuple[torch.Tensor, torch.Tensor]:
         return qtensor._qdata, qtensor._params.scale
 
+    @classmethod
+    def state_dict_tensors(cls, qdata: torch.Tensor, params: Params) -> dict[str, torch.Tensor]:
+        """Return key suffix → tensor mapping for serialization."""
+        return {
+            "": qdata,  # empty suffix: the quantized data itself
+            "_scale": params.scale,  # the layout's scale tensor
+        }
+
 
 # ==================== Helper Utilities ====================
 
diff --git a/comfy_kitchen/tensor/nvfp4.py b/comfy_kitchen/tensor/nvfp4.py
@@ -48,15 +48,16 @@ def _tensor_fields(self) -> list[str]:
     def quantize(
         cls,
         tensor: torch.Tensor,
-        scale: torch.Tensor | float | None = None,
+        scale: torch.Tensor | float | str | None = None,
+        **kwargs,
     ) -> tuple[torch.Tensor, Params]:
         if tensor.dim() != 2:
             raise ValueError(f"NVFP4 requires 2D tensor, got {tensor.dim()}D")
 
         orig_dtype = tensor.dtype
         orig_shape = tuple(tensor.shape)
 
-        if scale is None:
+        if scale is None or scale == "recalculate":
             scale = torch.amax(tensor.abs()) / (F8_E4M3_MAX * F4_E2M1_MAX)
 
         if not isinstance(scale, torch.Tensor):
@@ -86,6 +87,15 @@ def get_plain_tensors(
     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         return qtensor._qdata, qtensor._params.scale, qtensor._params.block_scale
 
+    @classmethod
+    def state_dict_tensors(cls, qdata: torch.Tensor, params: Params) -> dict[str, torch.Tensor]:
+        """Return key suffix → tensor mapping for serialization."""
+        return {
+            "": qdata,  # empty suffix: the quantized data itself
+            "_scale": params.block_scale,  # note: "_scale" carries the block scale in this layout
+            "_scale_2": params.scale,  # params.scale is stored under "_scale_2", not "_scale"
+        }
+
     @classmethod
     def get_padded_shape(cls, orig_shape: tuple[int, ...]) -> tuple[int, ...]:
         if len(orig_shape) != 2:
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,6 @@ classifiers = [
 ]
 dependencies = [
     "torch>=2.5.0",
-    "cuda-core[cu13]>=0.3.2,<1.0",
     "nvidia-cublas>=13.0.0",
 ]
 
diff --git a/setup.py b/setup.py
@@ -289,8 +289,12 @@ def get_cmdclass(has_extensions):
         class CUDABdistWheel(bdist_wheel):
             def finalize_options(self):
                 super().finalize_options()
-                # Add CUDA version as local version identifier (e.g., 0.1.0+cu128)
                 if not BUILD_NO_CUDA:
+                    # Set stable ABI tag: cp312-abi3 instead of cp312-cp312
+                    # This indicates the extension uses Python's Limited API
+                    self.py_limited_api = "cp312"
+
+                    # Add CUDA version as local version identifier (e.g., 0.1.0+cu128)
                     cuda_version = get_cuda_version()
                     if cuda_version and self.distribution.metadata.version:
                         cuda_tag = f"cu{cuda_version[0]}{cuda_version[1]}"
@@ -343,9 +347,9 @@ def get_packages():
 
     setup_kwargs.update({
         "packages": get_packages(),
-        "name": "comfy-kitchen-no-cuda",
+        "name": "comfy-kitchen",
         "version": version,
-        "description": f"{description} (CPU-only, no CUDA)",
+        "description": f"{description} (CPU-only)",
         "include_package_data": False,
         "install_requires": [
             "torch>=2.5.0",
diff --git a/tests/test_tensor.py b/tests/test_tensor.py

Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,6 @@ classifiers = [`
`23`	`23`	`]`
`24`	`24`	`dependencies = [`
`25`	`25`	`"torch>=2.5.0",`
`26`		`- "cuda-core[cu13]>=0.3.2,<1.0",`
`27`	`26`	`"nvidia-cublas>=13.0.0",`
`28`	`27`	`]`
`29`	`28`