|
10 | 10 | },
|
11 | 11 | {
|
12 | 12 | "cell_type": "code",
|
13 |
| - "execution_count": 2, |
| 13 | + "execution_count": 60, |
14 | 14 | "id": "9811bf38-83f3-4f27-a899-8ebe823977cb",
|
15 | 15 | "metadata": {},
|
16 | 16 | "outputs": [],
|
17 | 17 | "source": [
|
| 18 | + "import itertools\n", |
18 | 19 | "import math\n",
|
19 | 20 | "import matplotlib.pyplot as plt\n",
|
20 | 21 | "%matplotlib inline\n",
|
|
1148 | 1149 | "Giving threads more work improves the performance by a factor of 15."
|
1149 | 1150 | ]
|
1150 | 1151 | },
|
| 1152 | + { |
| 1153 | + "cell_type": "markdown", |
| 1154 | + "id": "5f81b104-489d-4368-a54d-695ba3c2e04e", |
| 1155 | + "metadata": { |
| 1156 | + "tags": [] |
| 1157 | + }, |
| 1158 | + "source": [ |
| 1159 | + "# Device functions" |
| 1160 | + ] |
| 1161 | + }, |
| 1162 | + { |
| 1163 | + "cell_type": "markdown", |
| 1164 | + "id": "a2cc724a-5058-46a5-85d8-e56be4e45b1e", |
| 1165 | + "metadata": {}, |
| 1166 | + "source": [ |
| 1167 | + "A device function is defined by passing `device=True` to the `cuda.jit` decorator. Such functions can only be called from kernels or other device functions, never from the host." |
| 1168 | + ] |
| 1169 | + }, |
| 1170 | + { |
| 1171 | + "cell_type": "code", |
| 1172 | + "execution_count": 81, |
| 1173 | + "id": "15ca4d83-8bdc-480a-bf5a-3bc418b2ba81", |
| 1174 | + "metadata": {}, |
| 1175 | + "outputs": [], |
| 1176 | + "source": [ |
| 1177 | + "@cuda.jit(device=True)\n", |
| 1178 | + "def distance_dev(a, b):\n", |
| 1179 | + " return math.sqrt(a**2 + b**2)" |
| 1180 | + ] |
| 1181 | + }, |
| 1182 | + { |
| 1183 | + "cell_type": "code", |
| 1184 | + "execution_count": 71, |
| 1185 | + "id": "3cb16af4-4cdc-4593-bb70-b74797190890", |
| 1186 | + "metadata": {}, |
| 1187 | + "outputs": [], |
| 1188 | + "source": [ |
| 1189 | + "candidates = np.array(list(itertools.permutations(range(1, 10))), dtype=np.int64)" |
| 1190 | + ] |
| 1191 | + }, |
| 1192 | + { |
| 1193 | + "cell_type": "code", |
| 1194 | + "execution_count": 88, |
| 1195 | + "id": "7bb4eb11-bfd3-42ab-9ffd-fbd5a5fda0ea", |
| 1196 | + "metadata": {}, |
| 1197 | + "outputs": [], |
| 1198 | + "source": [ |
| 1199 | + "nr_threads_per_block = 128\n", |
| 1200 | + "n = 10_000*nr_threads_per_block\n", |
| 1201 | + "a = np.random.uniform(size=n).astype(np.float32)\n", |
| 1202 | + "b = np.random.uniform(size=n).astype(np.float32)\n", |
| 1203 | + "expected = np.sqrt(a**2 + b**2)\n", |
| 1204 | + "out = np.empty_like(a)" |
| 1205 | + ] |
| 1206 | + }, |
| 1207 | + { |
| 1208 | + "cell_type": "code", |
| 1209 | + "execution_count": 85, |
| 1210 | + "id": "fc4145df-af45-4ea4-a5bd-36e4f2b54384", |
| 1211 | + "metadata": {}, |
| 1212 | + "outputs": [], |
| 1213 | + "source": [ |
| 1214 | + "@cuda.jit\n", |
| 1215 | + "def distance_kernel(a, b, out):\n", |
| 1216 | + " start = cuda.grid(1)\n", |
| 1217 | + " stride = cuda.gridsize(1)\n", |
| 1218 | + " for i in range(start, a.shape[0], stride):\n", |
| 1219 | + " out[i] = distance_dev(a[i], b[i])" |
| 1220 | + ] |
| 1221 | + }, |
| 1222 | + { |
| 1223 | + "cell_type": "code", |
| 1224 | + "execution_count": 89, |
| 1225 | + "id": "73688f2e-4c4c-447d-8e25-2dc363c73cf7", |
| 1226 | + "metadata": {}, |
| 1227 | + "outputs": [], |
| 1228 | + "source": [ |
| 1229 | + "distance_kernel[nr_blocks, nr_threads_per_block](a, b, out)" |
| 1230 | + ] |
| 1231 | + }, |
| 1232 | + { |
| 1233 | + "cell_type": "code", |
| 1234 | + "execution_count": 90, |
| 1235 | + "id": "3554a442-c69f-4556-a0ef-81f278cca7cf", |
| 1236 | + "metadata": {}, |
| 1237 | + "outputs": [], |
| 1238 | + "source": [ |
| 1239 | + "np.testing.assert_almost_equal(out, expected)" |
| 1240 | + ] |
| 1241 | + }, |
1151 | 1242 | {
|
1152 | 1243 | "cell_type": "markdown",
|
1153 | 1244 | "id": "7015ff19-38b6-4b9b-8e17-2bef649c3774",
|
|
1209 | 1300 | "source": [
|
1210 | 1301 | "Thanks to coalesced memory access, the performance is again improved."
|
1211 | 1302 | ]
|
| 1303 | + }, |
| 1304 | + { |
| 1305 | + "cell_type": "code", |
| 1306 | + "execution_count": 18, |
| 1307 | + "id": "da85580a-cb83-49f5-b6a5-f45a3972b532", |
| 1308 | + "metadata": {}, |
| 1309 | + "outputs": [], |
| 1310 | + "source": [ |
| 1311 | + "@cuda.jit\n", |
| 1312 | + "def map_2d_kernel(A):\n", |
| 1313 | + " idx = cuda.grid(1)\n", |
| 1314 | + " x, y = cuda.grid(2)\n", |
| 1315 | + " A[x, y] = idx" |
| 1316 | + ] |
| 1317 | + }, |
| 1318 | + { |
| 1319 | + "cell_type": "code", |
| 1320 | + "execution_count": 20, |
| 1321 | + "id": "b80132e5-3ebf-40f4-b4b2-2a919a3b16fd", |
| 1322 | + "metadata": {}, |
| 1323 | + "outputs": [], |
| 1324 | + "source": [ |
| 1325 | + "A = np.zeros((16, 16), dtype=np.int32)\n", |
| 1326 | + "nr_blocks = (1, 1)\n", |
| 1327 | + "nr_threads_per_block = (16, 16)" |
| 1328 | + ] |
| 1329 | + }, |
| 1330 | + { |
| 1331 | + "cell_type": "code", |
| 1332 | + "execution_count": 21, |
| 1333 | + "id": "52c803e6-7eee-4c93-b5ad-79d630ab4874", |
| 1334 | + "metadata": {}, |
| 1335 | + "outputs": [ |
| 1336 | + { |
| 1337 | + "name": "stderr", |
| 1338 | + "output_type": "stream", |
| 1339 | + "text": [ |
| 1340 | + "/home/gjb/miniconda3/envs/numba/lib/python3.9/site-packages/numba/cuda/compiler.py:865: NumbaPerformanceWarning: Grid size (1) < 2 * SM count (68) will likely result in GPU under utilization due to low occupancy.\n", |
| 1341 | + " warn(NumbaPerformanceWarning(msg))\n", |
| 1342 | + "/home/gjb/miniconda3/envs/numba/lib/python3.9/site-packages/numba/cuda/cudadrv/devicearray.py:790: NumbaPerformanceWarning: Host array used in CUDA kernel will incur copy overhead to/from device.\n", |
| 1343 | + " warn(NumbaPerformanceWarning(msg))\n" |
| 1344 | + ] |
| 1345 | + } |
| 1346 | + ], |
| 1347 | + "source": [ |
| 1348 | + "map_2d_kernel[nr_blocks, nr_threads_per_block](A)" |
| 1349 | + ] |
| 1350 | + }, |
| 1351 | + { |
| 1352 | + "cell_type": "code", |
| 1353 | + "execution_count": 22, |
| 1354 | + "id": "82a167e3-b6f2-46c2-b0a7-81cd6c64226a", |
| 1355 | + "metadata": {}, |
| 1356 | + "outputs": [ |
| 1357 | + { |
| 1358 | + "data": { |
| 1359 | + "text/plain": [ |
| 1360 | + "array([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", |
| 1361 | + " [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", |
| 1362 | + " [ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],\n", |
| 1363 | + " [ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],\n", |
| 1364 | + " [ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],\n", |
| 1365 | + " [ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5],\n", |
| 1366 | + " [ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6],\n", |
| 1367 | + " [ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7],\n", |
| 1368 | + " [ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8],\n", |
| 1369 | + " [ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9],\n", |
| 1370 | + " [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],\n", |
| 1371 | + " [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],\n", |
| 1372 | + " [12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12],\n", |
| 1373 | + " [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13],\n", |
| 1374 | + " [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],\n", |
| 1375 | + " [15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]],\n", |
| 1376 | + " dtype=int32)" |
| 1377 | + ] |
| 1378 | + }, |
| 1379 | + "execution_count": 22, |
| 1380 | + "metadata": {}, |
| 1381 | + "output_type": "execute_result" |
| 1382 | + } |
| 1383 | + ], |
| 1384 | + "source": [ |
| 1385 | + "A" |
| 1386 | + ] |
| 1387 | + }, |
| 1388 | + { |
| 1389 | + "cell_type": "markdown", |
| 1390 | + "id": "65f42313-ad4e-47e5-ba52-025b31fe2362", |
| 1391 | + "metadata": {}, |
| 1392 | + "source": [ |
| 1393 | + "# Row versus column sum and memory access" |
| 1394 | + ] |
| 1395 | + }, |
| 1396 | + { |
| 1397 | + "cell_type": "code", |
| 1398 | + "execution_count": 6, |
| 1399 | + "id": "e7774f98-24d5-4865-a60f-209551fc912f", |
| 1400 | + "metadata": {}, |
| 1401 | + "outputs": [], |
| 1402 | + "source": [ |
| 1403 | + "n = 2**14\n", |
| 1404 | + "nr_threads_per_block = 128\n", |
| 1405 | + "nr_blocks = n//nr_threads_per_block" |
| 1406 | + ] |
| 1407 | + }, |
| 1408 | + { |
| 1409 | + "cell_type": "code", |
| 1410 | + "execution_count": 7, |
| 1411 | + "id": "58e0cb31-254a-42fd-ad04-dfa89e0f0482", |
| 1412 | + "metadata": {}, |
| 1413 | + "outputs": [], |
| 1414 | + "source": [ |
| 1415 | + "A = np.random.uniform(size=(n, n)).astype(np.float32)\n", |
| 1416 | + "A_dev = cuda.to_device(A)\n", |
| 1417 | + "sums = np.zeros((n, ), dtype=np.float32)\n", |
| 1418 | + "sums_dev = cuda.to_device(sums)" |
| 1419 | + ] |
| 1420 | + }, |
| 1421 | + { |
| 1422 | + "cell_type": "code", |
| 1423 | + "execution_count": 8, |
| 1424 | + "id": "cf3b9126-184c-4e00-9d0d-476726224d6b", |
| 1425 | + "metadata": {}, |
| 1426 | + "outputs": [], |
| 1427 | + "source": [ |
| 1428 | + "@cuda.jit\n", |
| 1429 | + "def row_sum_kernel(A, sums):\n", |
| 1430 | + " idx = cuda.grid(1)\n", |
| 1431 | + " row_sum = 0.0\n", |
| 1432 | + " for j in range(A.shape[1]):\n", |
| 1433 | + " row_sum += A[idx][j]\n", |
| 1434 | + " sums[idx] = row_sum" |
| 1435 | + ] |
| 1436 | + }, |
| 1437 | + { |
| 1438 | + "cell_type": "code", |
| 1439 | + "execution_count": 9, |
| 1440 | + "id": "debebeb5-77cb-4c47-b5a1-cd26cc1abbd9", |
| 1441 | + "metadata": {}, |
| 1442 | + "outputs": [], |
| 1443 | + "source": [ |
| 1444 | + "row_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev)" |
| 1445 | + ] |
| 1446 | + }, |
| 1447 | + { |
| 1448 | + "cell_type": "code", |
| 1449 | + "execution_count": 10, |
| 1450 | + "id": "704b7952-9b2c-4702-97fb-54978837b5b7", |
| 1451 | + "metadata": {}, |
| 1452 | + "outputs": [ |
| 1453 | + { |
| 1454 | + "data": { |
| 1455 | + "text/plain": [ |
| 1456 | + "array([8197.585 , 8185.74 , 8168.036 , ..., 8218.73 , 8190.8506,\n", |
| 1457 | + " 8219.317 ], dtype=float32)" |
| 1458 | + ] |
| 1459 | + }, |
| 1460 | + "execution_count": 10, |
| 1461 | + "metadata": {}, |
| 1462 | + "output_type": "execute_result" |
| 1463 | + } |
| 1464 | + ], |
| 1465 | + "source": [ |
| 1466 | + "sums_dev.copy_to_host()" |
| 1467 | + ] |
| 1468 | + }, |
| 1469 | + { |
| 1470 | + "cell_type": "code", |
| 1471 | + "execution_count": 16, |
| 1472 | + "id": "63fc33cf-a494-44f1-9e1f-6c864dcb6a65", |
| 1473 | + "metadata": {}, |
| 1474 | + "outputs": [ |
| 1475 | + { |
| 1476 | + "name": "stdout", |
| 1477 | + "output_type": "stream", |
| 1478 | + "text": [ |
| 1479 | + "15.4 ms ± 91.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
| 1480 | + ] |
| 1481 | + } |
| 1482 | + ], |
| 1483 | + "source": [ |
| 1484 | + "%timeit row_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev); cuda.synchronize()" |
| 1485 | + ] |
| 1486 | + }, |
| 1487 | + { |
| 1488 | + "cell_type": "code", |
| 1489 | + "execution_count": 12, |
| 1490 | + "id": "13f25c85-ba41-44c9-bc18-7156884e45ab", |
| 1491 | + "metadata": {}, |
| 1492 | + "outputs": [], |
| 1493 | + "source": [ |
| 1494 | + "@cuda.jit\n", |
| 1495 | + "def col_sum_kernel(A, sums):\n", |
| 1496 | + " idx = cuda.grid(1)\n", |
| 1497 | + " col_sum = 0.0\n", |
| 1498 | + " for i in range(A.shape[0]):\n", |
| 1499 | + " col_sum += A[i][idx]\n", |
| 1500 | + " sums[idx] = col_sum" |
| 1501 | + ] |
| 1502 | + }, |
| 1503 | + { |
| 1504 | + "cell_type": "code", |
| 1505 | + "execution_count": 13, |
| 1506 | + "id": "dba6c68b-9cca-4aa3-b89e-28157e3ab037", |
| 1507 | + "metadata": {}, |
| 1508 | + "outputs": [], |
| 1509 | + "source": [ |
| 1510 | + "col_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev)" |
| 1511 | + ] |
| 1512 | + }, |
| 1513 | + { |
| 1514 | + "cell_type": "code", |
| 1515 | + "execution_count": 14, |
| 1516 | + "id": "b13ca87d-8852-4446-9574-0b9a87e5d98c", |
| 1517 | + "metadata": {}, |
| 1518 | + "outputs": [ |
| 1519 | + { |
| 1520 | + "data": { |
| 1521 | + "text/plain": [ |
| 1522 | + "array([8220.059 , 8201.879 , 8192.549 , ..., 8229.558 , 8151.019 ,\n", |
| 1523 | + " 8127.3354], dtype=float32)" |
| 1524 | + ] |
| 1525 | + }, |
| 1526 | + "execution_count": 14, |
| 1527 | + "metadata": {}, |
| 1528 | + "output_type": "execute_result" |
| 1529 | + } |
| 1530 | + ], |
| 1531 | + "source": [ |
| 1532 | + "sums_dev.copy_to_host()" |
| 1533 | + ] |
| 1534 | + }, |
| 1535 | + { |
| 1536 | + "cell_type": "code", |
| 1537 | + "execution_count": 17, |
| 1538 | + "id": "32802150-78e9-453a-83ae-f8b16c8ac8a5", |
| 1539 | + "metadata": {}, |
| 1540 | + "outputs": [ |
| 1541 | + { |
| 1542 | + "name": "stdout", |
| 1543 | + "output_type": "stream", |
| 1544 | + "text": [ |
| 1545 | + "5.31 ms ± 20.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
| 1546 | + ] |
| 1547 | + } |
| 1548 | + ], |
| 1549 | + "source": [ |
| 1550 | + "%timeit col_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev); cuda.synchronize()" |
| 1551 | + ] |
| 1552 | + }, |
| 1553 | + { |
| 1554 | + "cell_type": "code", |
| 1555 | + "execution_count": null, |
| 1556 | + "id": "29f89a63-c827-484a-98dc-7d81311bd25f", |
| 1557 | + "metadata": {}, |
| 1558 | + "outputs": [], |
| 1559 | + "source": [] |
1212 | 1560 | }
|
1213 | 1561 | ],
|
1214 | 1562 | "metadata": {
|
|
0 commit comments