
Commit 1e68d88

Merge pull request #393 from ricardosp4/nd-transpose
Performance optimization for `permute_dims`
2 parents 51cba77 + a6a86ce commit 1e68d88

3 files changed: +173 -100 lines changed
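The core of the optimization is that `permute_dims` now walks the source array chunk by chunk (via `iterchunks_info()`) and copies each transposed block straight into the matching region of the result, instead of pre-building every block slice with `np.indices`, as the two file diffs below show. The following sketch (plain Python/NumPy, not the library code; the helper `chunk_slices` is purely illustrative) shows how a chunk's source and destination slices follow from its grid coordinates, with `coords`, `chunks`, `shape`, and `axes` mirroring the names used in the diff:

import numpy as np

def chunk_slices(coords, chunks, shape, axes):
    # (start, stop) per dimension, clipped to the array boundary
    start_stop = [
        (coord * chunk, min(chunk * (coord + 1), dim))
        for coord, chunk, dim in zip(coords, chunks, shape)
    ]
    src = tuple(slice(start, stop) for start, stop in start_stop)
    # The destination slice reads the same (start, stop) pairs in permuted order
    dst = tuple(slice(start_stop[ax][0], start_stop[ax][1]) for ax in axes)
    return src, dst

# Example: the chunk at grid coordinates (1, 0) of a (5000, 5000) array
# chunked as (1000, 1000), permuted with axes=(1, 0)
src, dst = chunk_slices((1, 0), (1000, 1000), (5000, 5000), (1, 0))
print(src)  # (slice(1000, 2000, None), slice(0, 1000, None))
print(dst)  # (slice(0, 1000, None), slice(1000, 2000, None))

Because the result array is allocated with correspondingly permuted chunks (kwargs["chunks"] = tuple(arr.chunks[axis] for axis in axes) in the diff), each transposed block should map onto a whole destination chunk.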

bench/ndarray/transpose.ipynb

Lines changed: 121 additions & 46 deletions

@@ -8,7 +8,12 @@
 "import blosc2\n",
 "import time\n",
 "import plotly.express as px\n",
-"import pandas as pd"
+"import pandas as pd\n",
+"\n",
+"from blosc2 import NDArray\n",
+"from typing import Any\n",
+"\n",
+"import builtins"
 ],
 "id": "55765646130156ef",
 "outputs": [],
@@ -18,35 +23,104 @@
 "metadata": {},
 "cell_type": "code",
 "source": [
-"sizes = [(100, 100), (500, 500), (500, 1000), (1000, 1000), (2000, 2000), (3000, 3000), (4000, 4000), (5000, 5000)]\n",
-"sizes_mb = [(np.prod(size) * 8) / 2**20 for size in sizes] # Convert to MB\n",
-"results = {\"numpy\": [], \"blosc2\": []}"
+"def new_permute_dims(arr: NDArray, axes: tuple[int] | list[int] | None = None, **kwargs: Any) -> NDArray:\n",
+"    if np.isscalar(arr) or arr.ndim < 2:\n",
+"        return arr\n",
+"\n",
+"    ndim = arr.ndim\n",
+"    if axes is None:\n",
+"        axes = tuple(range(ndim))[::-1]\n",
+"    else:\n",
+"        axes = tuple(axis if axis >= 0 else ndim + axis for axis in axes)\n",
+"        if sorted(axes) != list(range(ndim)):\n",
+"            raise ValueError(f\"axes {axes} is not a valid permutation of {ndim} dimensions\")\n",
+"\n",
+"    new_shape = tuple(arr.shape[axis] for axis in axes)\n",
+"    if \"chunks\" not in kwargs or kwargs[\"chunks\"] is None:\n",
+"        kwargs[\"chunks\"] = tuple(arr.chunks[axis] for axis in axes)\n",
+"\n",
+"    result = blosc2.empty(shape=new_shape, dtype=arr.dtype, **kwargs)\n",
+"\n",
+"    # Precompute per-dimension info\n",
+"    chunks = arr.chunks\n",
+"    shape = arr.shape\n",
+"\n",
+"    for info in arr.iterchunks_info():\n",
+"        coords = info.coords\n",
+"        start_stop = [\n",
+"            (coord * chunk, builtins.min(chunk * (coord + 1), dim))\n",
+"            for coord, chunk, dim in zip(coords, chunks, shape)\n",
+"        ]\n",
+"\n",
+"        src_slice = tuple(slice(start, stop) for start, stop in start_stop)\n",
+"        dst_slice = tuple(slice(start_stop[ax][0], start_stop[ax][1]) for ax in axes)\n",
+"\n",
+"        transposed = np.transpose(arr[src_slice], axes=axes)\n",
+"        result[dst_slice] = np.ascontiguousarray(transposed)\n",
+"\n",
+"    return result"
 ],
 "id": "1cfb7daa6eee1401",
 "outputs": [],
 "execution_count": null
 },
 {
-"metadata": {},
+"metadata": {
+  "jupyter": {
+    "is_executing": true
+  }
+},
 "cell_type": "code",
 "source": [
-"for method in [\"numpy\", \"blosc2\"]:\n",
-"    for size in sizes:\n",
-"        arr = np.random.rand(*size)\n",
-"        arr_b2 = blosc2.asarray(arr)\n",
+"def validate_results(result_orig, result_new, shape):\n",
+"    if not np.allclose(result_orig[:], result_new[:]):\n",
+"        raise ValueError(f\"Mismatch found for shape {shape}\")\n",
+"\n",
+"shapes = [\n",
+"    (100, 100), (2000, 2000), (3000, 3000), (4000, 4000), (3000, 7000),\n",
+"    (5000, 5000), (6000, 6000), (7000, 7000), (8000, 8000), (6000, 12000),\n",
+"    (9000, 9000), (10000, 10000), (10500, 10500), (11000, 11000), (11500, 11500),\n",
+"    (12000, 12000), (12500, 12500), (13000, 13000), (13500, 13500), (14000, 14000),\n",
+"    (14500, 14500), (15000, 15000), (16000, 16000), (16500, 16500), (17000, 17000),\n",
+"    (17500, 17500), (18000, 18000)\n",
+"]\n",
+"\n",
+"sizes = []\n",
+"time_total = []\n",
+"chunk_labels = []\n",
 "\n",
-"        start_time = time.perf_counter()\n",
+"def numpy_permute(arr: np.ndarray, axes: tuple[int] | list[int] | None = None) -> np.ndarray:\n",
+"    if axes is None:\n",
+"        axes = range(arr.ndim)[::-1]\n",
+"    return np.transpose(arr, axes=axes).copy()\n",
 "\n",
-"        if method == \"numpy\":\n",
-"            np.transpose(arr).copy()\n",
-"        elif method == \"blosc2\":\n",
-"            blosc2.transpose(arr_b2)\n",
+"for shape in shapes:\n",
+"    size_mb = (np.prod(shape) * 8) / (2 ** 20)\n",
+"\n",
+"    # NumPy transpose\n",
+"    matrix_numpy = np.linspace(0, 1, np.prod(shape)).reshape(shape)\n",
+"    t0 = time.perf_counter()\n",
+"    result_numpy = numpy_permute(matrix_numpy)\n",
+"    t1 = time.perf_counter()\n",
+"    time_total.append(t1 - t0)\n",
+"    sizes.append(size_mb)\n",
+"    chunk_labels.append(\"numpy.transpose()\")\n",
 "\n",
-"        end_time = time.perf_counter()\n",
-"        time_b = end_time - start_time\n",
+"    # New permute dims (optimized)\n",
+"    matrix_blosc2 = blosc2.linspace(0, 1, np.prod(shape), shape=shape)\n",
+"    t0 = time.perf_counter()\n",
+"    result_new_perm = new_permute_dims(matrix_blosc2)\n",
+"    t1 = time.perf_counter()\n",
+"    time_total.append(t1 - t0)\n",
+"    sizes.append(size_mb)\n",
+"    chunk_labels.append(\"blosc2.permute_dims()\")\n",
+"\n",
+"    try:\n",
+"        validate_results(result_new_perm, result_numpy, shape)\n",
+"    except ValueError as e:\n",
+"        print(e)\n",
 "\n",
-"        print(f\"{method}: shape={size}, Performance = {time_b:.6f} s\")\n",
-"        results[method].append(time_b)"
+"    print(f\"Shape={shape}, Chunk={matrix_blosc2.chunks}: permute_dims={time_total[-2]:.6f}s, numpy={time_total[-1]:.6f}s\")"
 ],
 "id": "384d0ad7983a8d26",
 "outputs": [],
@@ -57,21 +131,21 @@
 "cell_type": "code",
 "source": [
 "df = pd.DataFrame({\n",
-"    \"Matrix Size (MB)\": sizes_mb,\n",
-"    \"NumPy Time (s)\": results[\"numpy\"],\n",
-"    \"Blosc2 Time (s)\": results[\"blosc2\"]\n",
+"    \"Matrix Size (MB)\": sizes,\n",
+"    \"Time (s)\": time_total,\n",
+"    \"Implementation\": chunk_labels\n",
 "})\n",
 "\n",
 "fig = px.line(df,\n",
 "              x=\"Matrix Size (MB)\",\n",
-"              y=[\"NumPy Time (s)\", \"Blosc2 Time (s)\"],\n",
-"              title=\"Performance of Matrix Transposition (NumPy vs Blosc2)\",\n",
-"              labels={\"value\": \"Time (s)\", \"variable\": \"Method\"},\n",
+"              y=\"Time (s)\",\n",
+"              color=\"Implementation\",\n",
+"              title=\"Performance: NumPy vs Blosc2\",\n",
+"              width=1000, height=600,\n",
 "              markers=True)\n",
-"\n",
 "fig.show()"
 ],
-"id": "c71ffb39eb28992c",
+"id": "786b8b7b5ea95225",
 "outputs": [],
 "execution_count": null
 },
@@ -81,15 +155,14 @@
 "source": [
 "%%time\n",
 "shapes = [\n",
-"    (100, 100), (2000, 2000), (3000, 3000), (4000, 4000), (3000, 7000)\n",
-"    # (5000, 5000), (6000, 6000), (7000, 7000), (8000, 8000), (6000, 12000),\n",
-"    # (9000, 9000), (10000, 10000),\n",
-"    # (10500, 10500), (11000, 11000), (11500, 11500), (12000, 12000),\n",
-"    # (12500, 12500), (13000, 13000), (13500, 13500), (14000, 14000),\n",
-"    # (14500, 14500), (15000, 15000), (15500, 15500), (16000, 16000),\n",
-"    # (16500, 16500), (17000, 17000)\n",
+"    (100, 100), (1000, 1000), (2000, 2000), (3000, 3000), (4000, 4000),\n",
+"    (5000, 5000), (6000, 6000), (7000, 7000), (8000, 8000), (9000, 9000),\n",
+"    (9500, 9500), (10000, 10000), (10500, 10500), (11000, 11000), (11500, 11500),\n",
+"    (12000, 12000), (12500, 12500), (13000, 13000), (13500, 13500), (14000, 14000),\n",
+"    (14500, 14500), (15000, 15000), (16000, 16000), (16500, 16500), (17000, 17000)\n",
 "]\n",
-"chunkshapes = [None, (150, 300), (200, 500), (500, 200), (1000, 1000)]\n",
+"\n",
+"chunkshapes = [None, (150, 300), (1000, 1000), (4000, 4000)]\n",
 "\n",
 "sizes = []\n",
 "time_total = []\n",
@@ -111,18 +184,27 @@
 "    print(f\"NumPy: Shape={shape}, Time = {numpy_time:.6f} s\")\n",
 "\n",
 "    for chunk in chunkshapes:\n",
-"        matrix_blosc2 = blosc2.asarray(matrix_np, chunks=chunk)\n",
+"        matrix_blosc2 = blosc2.asarray(matrix_np)\n",
+"        matrix_blosc2 = blosc2.linspace(0, 1, np.prod(shape), shape=shape)\n",
 "\n",
 "        t0 = time.perf_counter()\n",
-"        result_blosc2 = blosc2.transpose(matrix_blosc2)\n",
+"        result_blosc2 = new_permute_dims(matrix_blosc2, chunks=chunk)\n",
 "        blosc2_time = time.perf_counter() - t0\n",
 "\n",
 "        sizes.append(size_mb)\n",
 "        time_total.append(blosc2_time)\n",
 "        chunk_labels.append(f\"{chunk[0]}x{chunk[1]}\" if chunk else \"Auto\")\n",
 "\n",
-"        print(f\"Blosc2: Shape={shape}, Chunks = {matrix_blosc2.chunks}, Time = {blosc2_time:.6f} s\")\n",
-"\n",
+"        print(f\"Blosc2: Shape={shape}, Chunks = {result_blosc2.chunks}, Time = {blosc2_time:.6f} s\")"
+],
+"id": "bcdd8aa5f65df561",
+"outputs": [],
+"execution_count": null
+},
+{
+"metadata": {},
+"cell_type": "code",
+"source": [
 "df = pd.DataFrame({\n",
 "    \"Matrix Size (MB)\": sizes,\n",
 "    \"Time (s)\": time_total,\n",
@@ -135,17 +217,10 @@
 "              color=\"Chunk Shape\",\n",
 "              title=\"Performance of Matrix Transposition (Blosc2 vs NumPy)\",\n",
 "              labels={\"value\": \"Time (s)\", \"variable\": \"Metric\"},\n",
+"              width=1000, height=600,\n",
 "              markers=True)\n",
 "fig.show()"
 ],
-"id": "bcdd8aa5f65df561",
-"outputs": [],
-"execution_count": null
-},
-{
-"metadata": {},
-"cell_type": "code",
-"source": "",
 "id": "1d2f48f370ba7e7a",
 "outputs": [],
 "execution_count": null

src/blosc2/ndarray.py

Lines changed: 25 additions & 22 deletions

@@ -3876,11 +3876,11 @@ def matmul(x1: NDArray, x2: NDArray, **kwargs: Any) -> NDArray:
     r = x2.chunks[-1]

     for row in range(0, n, p):
-        row_end = (row + p) if (row + p) < n else n
+        row_end = builtins.min(row + p, n)
         for col in range(0, m, q):
-            col_end = (col + q) if (col + q) < m else m
+            col_end = builtins.min(col + q, m)
             for aux in range(0, k, r):
-                aux_end = (aux + r) if (aux + r) < k else k
+                aux_end = builtins.min(aux + r, k)
                 bx1 = x1[row:row_end, aux:aux_end]
                 bx2 = x2[aux:aux_end, col:col_end]
                 result[row:row_end, col:col_end] += np.matmul(bx1, bx2)
@@ -3951,6 +3951,7 @@ def permute_dims(arr: NDArray, axes: tuple[int] | list[int] | None = None, **kwargs: Any) -> NDArray:
            [[13, 14, 15, 16],
             [17, 18, 19, 20],
             [21, 22, 23, 24]]])
+
     >>> at = blosc2.permute_dims(a, axes=(1, 0, 2))
     >>> at[:]
     array([[[ 1,  2,  3,  4],
@@ -3960,37 +3961,39 @@ def permute_dims(arr: NDArray, axes: tuple[int] | list[int] | None = None, **kwargs: Any) -> NDArray:
           [[ 9, 10, 11, 12],
            [21, 22, 23, 24]]])
     """
-
     if np.isscalar(arr) or arr.ndim < 2:
         return arr

+    ndim = arr.ndim
+
     if axes is None:
-        axes = range(arr.ndim)[::-1]
+        axes = tuple(range(ndim))[::-1]
     else:
-        axes_transformed = tuple(axis if axis >= 0 else arr.ndim + axis for axis in axes)
-        if sorted(axes_transformed) != list(range(arr.ndim)):
-            raise ValueError(f"axes {axes} is not a valid permutation of {arr.ndim} dimensions")
-        axes = axes_transformed
+        axes = tuple(axis if axis >= 0 else ndim + axis for axis in axes)
+        if sorted(axes) != list(range(ndim)):
+            raise ValueError(f"axes {axes} is not a valid permutation of {ndim} dimensions")

     new_shape = tuple(arr.shape[axis] for axis in axes)
-    if "chunks" not in kwargs:
+    if "chunks" not in kwargs or kwargs["chunks"] is None:
         kwargs["chunks"] = tuple(arr.chunks[axis] for axis in axes)

     result = blosc2.empty(shape=new_shape, dtype=arr.dtype, **kwargs)

-    chunk_slices = [
-        [slice(start, builtins.min(dim, start + chunk)) for start in range(0, dim, chunk)]
-        for dim, chunk in zip(arr.shape, arr.chunks, strict=False)
-    ]
+    chunks = arr.chunks
+    shape = arr.shape
+
+    for info in arr.iterchunks_info():
+        coords = info.coords
+        start_stop = [
+            (coord * chunk, builtins.min(chunk * (coord + 1), dim))
+            for coord, chunk, dim in zip(coords, chunks, shape, strict=False)
+        ]

-    block_counts = [len(s) for s in chunk_slices]
-    grid = np.indices(block_counts).reshape(len(block_counts), -1).T
+        src_slice = tuple(slice(start, stop) for start, stop in start_stop)
+        dst_slice = tuple(slice(start_stop[ax][0], start_stop[ax][1]) for ax in axes)

-    for idx in grid:
-        block_slices = tuple(chunk_slices[axis][i] for axis, i in enumerate(idx))
-        block = arr[block_slices]
-        target_slices = tuple(block_slices[axis] for axis in axes)
-        result[target_slices] = np.transpose(block, axes=axes).copy()
+        transposed = np.transpose(arr[src_slice], axes=axes)
+        result[dst_slice] = np.ascontiguousarray(transposed)

     return result

@@ -4024,7 +4027,7 @@ def transpose(x, **kwargs: Any) -> NDArray:
         stacklevel=2,
     )

-    # If arguments are dimension < 2 they are returned
+    # If arguments are dimension < 2, they are returned
     if np.isscalar(x) or x.ndim < 2:
         return x