Add section on device functions

gjbex · gjbex · commit 026693eec4d4 · 2021-12-22T21:25:49.000+01:00
diff --git a/source-code/gpu/numba.ipynb b/source-code/gpu/numba.ipynb
@@ -10,11 +10,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 60,
    "id": "9811bf38-83f3-4f27-a899-8ebe823977cb",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import itertools\n",
     "import math\n",
     "import matplotlib.pyplot as plt\n",
     "%matplotlib inline\n",
@@ -1148,6 +1149,96 @@
     "Giving threads more work improves the performance by a factor of 15."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5f81b104-489d-4368-a54d-695ba3c2e04e",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Device functions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2cc724a-5058-46a5-85d8-e56be4e45b1e",
+   "metadata": {},
+   "source": [
+    "Device functions are defined by passing the optional argument `device=True` to the `@cuda.jit` decorator.  Such functions can only be called from kernels or other device functions, never from the host.  Unlike a kernel, a device function can return a value."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "15ca4d83-8bdc-480a-bf5a-3bc418b2ba81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit(device=True)\n",
+    "def distance_dev(a, b):\n",
+    "    return math.sqrt(a**2 + b**2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "3cb16af4-4cdc-4593-bb70-b74797190890",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "candidates = np.array(list(itertools.permutations(range(1, 10))), dtype=np.int64)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "7bb4eb11-bfd3-42ab-9ffd-fbd5a5fda0ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nr_threads_per_block = 128\n",
+    "n = 10_000*nr_threads_per_block\n",
+    "a = np.random.uniform(size=n).astype(np.float32)\n",
+    "b = np.random.uniform(size=n).astype(np.float32)\n",
+    "expected = np.sqrt(a**2 + b**2)\n",
+    "out = np.empty_like(a)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "fc4145df-af45-4ea4-a5bd-36e4f2b54384",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit\n",
+    "def distance_kernel(a, b, out):\n",
+    "    start = cuda.grid(1)\n",
+    "    stride = cuda.gridsize(1)\n",
+    "    for i in range(start, a.shape[0], stride):\n",
+    "        out[i] = distance_dev(a[i], b[i])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "73688f2e-4c4c-447d-8e25-2dc363c73cf7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "distance_kernel[nr_blocks, nr_threads_per_block](a, b, out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "3554a442-c69f-4556-a0ef-81f278cca7cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.testing.assert_almost_equal(out, expected)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7015ff19-38b6-4b9b-8e17-2bef649c3774",
@@ -1209,6 +1300,263 @@
    "source": [
     "Thanks to coalesced memory access, the performance is again improved."
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "da85580a-cb83-49f5-b6a5-f45a3972b532",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit\n",
+    "def map_2d_kernel(A):\n",
+    "    idx = cuda.grid(1)\n",
+    "    x, y = cuda.grid(2)\n",
+    "    A[x, y] = idx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b80132e5-3ebf-40f4-b4b2-2a919a3b16fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = np.zeros((16, 16), dtype=np.int32)\n",
+    "nr_blocks = (1, 1)\n",
+    "nr_threads_per_block = (16, 16)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "52c803e6-7eee-4c93-b5ad-79d630ab4874",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/gjb/miniconda3/envs/numba/lib/python3.9/site-packages/numba/cuda/compiler.py:865: NumbaPerformanceWarning: Grid size (1) < 2 * SM count (68) will likely result in GPU under utilization due to low occupancy.\n",
+      "  warn(NumbaPerformanceWarning(msg))\n",
+      "/home/gjb/miniconda3/envs/numba/lib/python3.9/site-packages/numba/cuda/cudadrv/devicearray.py:790: NumbaPerformanceWarning: Host array used in CUDA kernel will incur copy overhead to/from device.\n",
+      "  warn(NumbaPerformanceWarning(msg))\n"
+     ]
+    }
+   ],
+   "source": [
+    "map_2d_kernel[nr_blocks, nr_threads_per_block](A)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "82a167e3-b6f2-46c2-b0a7-81cd6c64226a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],\n",
+       "       [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],\n",
+       "       [ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2],\n",
+       "       [ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3],\n",
+       "       [ 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4],\n",
+       "       [ 5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5],\n",
+       "       [ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6],\n",
+       "       [ 7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7],\n",
+       "       [ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8],\n",
+       "       [ 9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9],\n",
+       "       [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],\n",
+       "       [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],\n",
+       "       [12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12],\n",
+       "       [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13],\n",
+       "       [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],\n",
+       "       [15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]],\n",
+       "      dtype=int32)"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "A"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "65f42313-ad4e-47e5-ba52-025b31fe2362",
+   "metadata": {},
+   "source": [
+    "# Row versus column sum and memory access"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e7774f98-24d5-4865-a60f-209551fc912f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 2**14\n",
+    "nr_threads_per_block = 128\n",
+    "nr_blocks = n//nr_threads_per_block"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "58e0cb31-254a-42fd-ad04-dfa89e0f0482",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = np.random.uniform(size=(n, n)).astype(np.float32)\n",
+    "A_dev = cuda.to_device(A)\n",
+    "sums = np.zeros((n, ), dtype=np.float32)\n",
+    "sums_dev = cuda.to_device(sums)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "cf3b9126-184c-4e00-9d0d-476726224d6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit\n",
+    "def row_sum_kernel(A, sums):\n",
+    "    idx = cuda.grid(1)\n",
+    "    row_sum = 0.0\n",
+    "    for j in range(A.shape[1]):\n",
+    "        row_sum += A[idx][j]\n",
+    "    sums[idx] = row_sum"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "debebeb5-77cb-4c47-b5a1-cd26cc1abbd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "row_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "704b7952-9b2c-4702-97fb-54978837b5b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([8197.585 , 8185.74  , 8168.036 , ..., 8218.73  , 8190.8506,\n",
+       "       8219.317 ], dtype=float32)"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sums_dev.copy_to_host()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "63fc33cf-a494-44f1-9e1f-6c864dcb6a65",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "15.4 ms ± 91.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit row_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev); cuda.synchronize()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "13f25c85-ba41-44c9-bc18-7156884e45ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit\n",
+    "def col_sum_kernel(A, sums):\n",
+    "    idx = cuda.grid(1)\n",
+    "    col_sum = 0.0\n",
+    "    for i in range(A.shape[0]):\n",
+    "        col_sum += A[i][idx]\n",
+    "    sums[idx] = col_sum"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "dba6c68b-9cca-4aa3-b89e-28157e3ab037",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "col_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "b13ca87d-8852-4446-9574-0b9a87e5d98c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([8220.059 , 8201.879 , 8192.549 , ..., 8229.558 , 8151.019 ,\n",
+       "       8127.3354], dtype=float32)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sums_dev.copy_to_host()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "32802150-78e9-453a-83ae-f8b16c8ac8a5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5.31 ms ± 20.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit col_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev); cuda.synchronize()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29f89a63-c827-484a-98dc-7d81311bd25f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {