Skip to content

Commit 026693e

Browse files
committed
Add section on device functions
1 parent 36ed68f commit 026693e

File tree

1 file changed

+349
-1
lines changed

1 file changed

+349
-1
lines changed

source-code/gpu/numba.ipynb

Lines changed: 349 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,12 @@
1010
},
1111
{
1212
"cell_type": "code",
13-
"execution_count": 2,
13+
"execution_count": 60,
1414
"id": "9811bf38-83f3-4f27-a899-8ebe823977cb",
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
18+
"import itertools\n",
1819
"import math\n",
1920
"import matplotlib.pyplot as plt\n",
2021
"%matplotlib inline\n",
@@ -1148,6 +1149,96 @@
11481149
"Giving threads more work improves the performance by a factor of 15."
11491150
]
11501151
},
1152+
{
1153+
"cell_type": "markdown",
1154+
"id": "5f81b104-489d-4368-a54d-695ba3c2e04e",
1155+
"metadata": {
1156+
"tags": []
1157+
},
1158+
"source": [
1159+
"# Device functions"
1160+
]
1161+
},
1162+
{
1163+
"cell_type": "markdown",
1164+
"id": "a2cc724a-5058-46a5-85d8-e56be4e45b1e",
1165+
"metadata": {},
1166+
"source": [
1167+
"Device functions are defined by passing `device=True` to the `cuda.jit` decorator. Unlike kernels, they can return a value, but they can only be called from kernels or other device functions, never from the host."
1168+
]
1169+
},
1170+
{
1171+
"cell_type": "code",
1172+
"execution_count": 81,
1173+
"id": "15ca4d83-8bdc-480a-bf5a-3bc418b2ba81",
1174+
"metadata": {},
1175+
"outputs": [],
1176+
"source": [
"@cuda.jit(device=True)\n",
"def distance_dev(a, b):\n",
"    \"\"\"Return math.sqrt(a**2 + b**2).\n",
"\n",
"    Compiled with device=True, so it is meant to be called from a kernel\n",
"    or another device function rather than launched from the host.\n",
"    \"\"\"\n",
"    sum_of_squares = a**2 + b**2\n",
"    return math.sqrt(sum_of_squares)"
1180+
]
1181+
},
1182+
{
1183+
"cell_type": "code",
1184+
"execution_count": 71,
1185+
"id": "3cb16af4-4cdc-4593-bb70-b74797190890",
1186+
"metadata": {},
1187+
"outputs": [],
1188+
"source": [
1189+
"candidates = np.array(list(itertools.permutations(range(1, 10))), dtype=np.int64)"
1190+
]
1191+
},
1192+
{
1193+
"cell_type": "code",
1194+
"execution_count": 88,
1195+
"id": "7bb4eb11-bfd3-42ab-9ffd-fbd5a5fda0ea",
1196+
"metadata": {},
1197+
"outputs": [],
1198+
"source": [
1199+
"nr_threads_per_block = 128\n",
1200+
"n = 10_000*nr_threads_per_block\n",
1201+
"a = np.random.uniform(size=n).astype(np.float32)\n",
1202+
"b = np.random.uniform(size=n).astype(np.float32)\n",
1203+
"expected = np.sqrt(a**2 + b**2)\n",
1204+
"out = np.empty_like(a)"
1205+
]
1206+
},
1207+
{
1208+
"cell_type": "code",
1209+
"execution_count": 85,
1210+
"id": "fc4145df-af45-4ea4-a5bd-36e4f2b54384",
1211+
"metadata": {},
1212+
"outputs": [],
1213+
"source": [
"@cuda.jit\n",
"def distance_kernel(a, b, out):\n",
"    \"\"\"Store distance_dev(a[k], b[k]) in out[k] for every index k of a.\n",
"\n",
"    Uses a grid-stride loop, so any 1D launch configuration covers all of a.\n",
"    \"\"\"\n",
"    first = cuda.grid(1)\n",
"    step = cuda.gridsize(1)\n",
"    n_items = a.shape[0]\n",
"    for k in range(first, n_items, step):\n",
"        out[k] = distance_dev(a[k], b[k])"
1220+
]
1221+
},
1222+
{
1223+
"cell_type": "code",
1224+
"execution_count": 89,
1225+
"id": "73688f2e-4c4c-447d-8e25-2dc363c73cf7",
1226+
"metadata": {},
1227+
"outputs": [],
1228+
"source": [
1229+
"distance_kernel[nr_blocks, nr_threads_per_block](a, b, out)"
1230+
]
1231+
},
1232+
{
1233+
"cell_type": "code",
1234+
"execution_count": 90,
1235+
"id": "3554a442-c69f-4556-a0ef-81f278cca7cf",
1236+
"metadata": {},
1237+
"outputs": [],
1238+
"source": [
"# float32 results: compare with a relative tolerance instead of a fixed\n",
"# number of decimals, which sits at roughly one float32 ulp for values near 1\n",
"np.testing.assert_allclose(out, expected, rtol=1e-6)"
1240+
]
1241+
},
11511242
{
11521243
"cell_type": "markdown",
11531244
"id": "7015ff19-38b6-4b9b-8e17-2bef649c3774",
@@ -1209,6 +1300,263 @@
12091300
"source": [
12101301
"Thanks to coalesced memory access, the performance is again improved."
12111302
]
1303+
},
1304+
{
1305+
"cell_type": "code",
1306+
"execution_count": 18,
1307+
"id": "da85580a-cb83-49f5-b6a5-f45a3972b532",
1308+
"metadata": {},
1309+
"outputs": [],
1310+
"source": [
"@cuda.jit\n",
"def map_2d_kernel(A):\n",
"    \"\"\"Write each thread's cuda.grid(1) index into A at its 2D grid position.\n",
"\n",
"    Threads whose (x, y) position lies outside A do nothing, so the launch\n",
"    grid may be larger than the array.\n",
"    \"\"\"\n",
"    idx = cuda.grid(1)\n",
"    x, y = cuda.grid(2)\n",
"    # guard against out-of-bounds writes when the grid overshoots the array\n",
"    if x < A.shape[0] and y < A.shape[1]:\n",
"        A[x, y] = idx"
1316+
]
1317+
},
1318+
{
1319+
"cell_type": "code",
1320+
"execution_count": 20,
1321+
"id": "b80132e5-3ebf-40f4-b4b2-2a919a3b16fd",
1322+
"metadata": {},
1323+
"outputs": [],
1324+
"source": [
1325+
"A = np.zeros((16, 16), dtype=np.int32)\n",
1326+
"nr_blocks = (1, 1)\n",
1327+
"nr_threads_per_block = (16, 16)"
1328+
]
1329+
},
1330+
{
1331+
"cell_type": "code",
1332+
"execution_count": 21,
1333+
"id": "52c803e6-7eee-4c93-b5ad-79d630ab4874",
1334+
"metadata": {},
1335+
"outputs": [
1336+
{
1337+
"name": "stderr",
1338+
"output_type": "stream",
1339+
"text": [
1340+
"/home/gjb/miniconda3/envs/numba/lib/python3.9/site-packages/numba/cuda/compiler.py:865: NumbaPerformanceWarning: Grid size (1) < 2 * SM count (68) will likely result in GPU under utilization due to low occupancy.\n",
1341+
" warn(NumbaPerformanceWarning(msg))\n",
1342+
"/home/gjb/miniconda3/envs/numba/lib/python3.9/site-packages/numba/cuda/cudadrv/devicearray.py:790: NumbaPerformanceWarning: Host array used in CUDA kernel will incur copy overhead to/from device.\n",
1343+
" warn(NumbaPerformanceWarning(msg))\n"
1344+
]
1345+
}
1346+
],
1347+
"source": [
1348+
"map_2d_kernel[nr_blocks, nr_threads_per_block](A)"
1349+
]
1350+
},
1351+
{
1352+
"cell_type": "code",
1353+
"execution_count": 22,
1354+
"id": "82a167e3-b6f2-46c2-b0a7-81cd6c64226a",
1355+
"metadata": {},
1356+
"outputs": [
1357+
{
1358+
"data": {
1359+
"text/plain": [
1360+
"array([[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
1361+
" [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
1362+
" [ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],\n",
1363+
" [ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],\n",
1364+
" [ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],\n",
1365+
" [ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5],\n",
1366+
" [ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6],\n",
1367+
" [ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7],\n",
1368+
" [ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8],\n",
1369+
" [ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9],\n",
1370+
" [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],\n",
1371+
" [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],\n",
1372+
" [12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12],\n",
1373+
" [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13],\n",
1374+
" [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],\n",
1375+
" [15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]],\n",
1376+
" dtype=int32)"
1377+
]
1378+
},
1379+
"execution_count": 22,
1380+
"metadata": {},
1381+
"output_type": "execute_result"
1382+
}
1383+
],
1384+
"source": [
1385+
"A"
1386+
]
1387+
},
1388+
{
1389+
"cell_type": "markdown",
1390+
"id": "65f42313-ad4e-47e5-ba52-025b31fe2362",
1391+
"metadata": {},
1392+
"source": [
1393+
"# Row versus column sum and memory access"
1394+
]
1395+
},
1396+
{
1397+
"cell_type": "code",
1398+
"execution_count": 6,
1399+
"id": "e7774f98-24d5-4865-a60f-209551fc912f",
1400+
"metadata": {},
1401+
"outputs": [],
1402+
"source": [
1403+
"n = 2**14\n",
1404+
"nr_threads_per_block = 128\n",
1405+
"nr_blocks = n//nr_threads_per_block"
1406+
]
1407+
},
1408+
{
1409+
"cell_type": "code",
1410+
"execution_count": 7,
1411+
"id": "58e0cb31-254a-42fd-ad04-dfa89e0f0482",
1412+
"metadata": {},
1413+
"outputs": [],
1414+
"source": [
1415+
"A = np.random.uniform(size=(n, n)).astype(np.float32)\n",
1416+
"A_dev = cuda.to_device(A)\n",
1417+
"sums = np.zeros((n, ), dtype=np.float32)\n",
1418+
"sums_dev = cuda.to_device(sums)"
1419+
]
1420+
},
1421+
{
1422+
"cell_type": "code",
1423+
"execution_count": 8,
1424+
"id": "cf3b9126-184c-4e00-9d0d-476726224d6b",
1425+
"metadata": {},
1426+
"outputs": [],
1427+
"source": [
"@cuda.jit\n",
"def row_sum_kernel(A, sums):\n",
"    \"\"\"Store the sum of row idx of A in sums[idx], one row per thread.\n",
"\n",
"    Threads whose index is not a valid row of A return without touching\n",
"    memory, so the grid may be larger than the number of rows.\n",
"    \"\"\"\n",
"    idx = cuda.grid(1)\n",
"    if idx >= A.shape[0]:\n",
"        return\n",
"    row_sum = 0.0\n",
"    for j in range(A.shape[1]):\n",
"        # index the 2D array directly instead of slicing out a row first\n",
"        row_sum += A[idx, j]\n",
"    sums[idx] = row_sum"
1435+
]
1436+
},
1437+
{
1438+
"cell_type": "code",
1439+
"execution_count": 9,
1440+
"id": "debebeb5-77cb-4c47-b5a1-cd26cc1abbd9",
1441+
"metadata": {},
1442+
"outputs": [],
1443+
"source": [
1444+
"row_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev)"
1445+
]
1446+
},
1447+
{
1448+
"cell_type": "code",
1449+
"execution_count": 10,
1450+
"id": "704b7952-9b2c-4702-97fb-54978837b5b7",
1451+
"metadata": {},
1452+
"outputs": [
1453+
{
1454+
"data": {
1455+
"text/plain": [
1456+
"array([8197.585 , 8185.74 , 8168.036 , ..., 8218.73 , 8190.8506,\n",
1457+
" 8219.317 ], dtype=float32)"
1458+
]
1459+
},
1460+
"execution_count": 10,
1461+
"metadata": {},
1462+
"output_type": "execute_result"
1463+
}
1464+
],
1465+
"source": [
1466+
"sums_dev.copy_to_host()"
1467+
]
1468+
},
1469+
{
1470+
"cell_type": "code",
1471+
"execution_count": 16,
1472+
"id": "63fc33cf-a494-44f1-9e1f-6c864dcb6a65",
1473+
"metadata": {},
1474+
"outputs": [
1475+
{
1476+
"name": "stdout",
1477+
"output_type": "stream",
1478+
"text": [
1479+
"15.4 ms ± 91.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
1480+
]
1481+
}
1482+
],
1483+
"source": [
1484+
"%timeit row_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev); cuda.synchronize()"
1485+
]
1486+
},
1487+
{
1488+
"cell_type": "code",
1489+
"execution_count": 12,
1490+
"id": "13f25c85-ba41-44c9-bc18-7156884e45ab",
1491+
"metadata": {},
1492+
"outputs": [],
1493+
"source": [
"@cuda.jit\n",
"def col_sum_kernel(A, sums):\n",
"    \"\"\"Store the sum of column idx of A in sums[idx], one column per thread.\n",
"\n",
"    Threads whose index is not a valid column of A return without touching\n",
"    memory, so the grid may be larger than the number of columns.\n",
"    \"\"\"\n",
"    idx = cuda.grid(1)\n",
"    if idx >= A.shape[1]:\n",
"        return\n",
"    col_sum = 0.0\n",
"    for i in range(A.shape[0]):\n",
"        # index the 2D array directly instead of slicing out a row first\n",
"        col_sum += A[i, idx]\n",
"    sums[idx] = col_sum"
1501+
]
1502+
},
1503+
{
1504+
"cell_type": "code",
1505+
"execution_count": 13,
1506+
"id": "dba6c68b-9cca-4aa3-b89e-28157e3ab037",
1507+
"metadata": {},
1508+
"outputs": [],
1509+
"source": [
1510+
"col_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev)"
1511+
]
1512+
},
1513+
{
1514+
"cell_type": "code",
1515+
"execution_count": 14,
1516+
"id": "b13ca87d-8852-4446-9574-0b9a87e5d98c",
1517+
"metadata": {},
1518+
"outputs": [
1519+
{
1520+
"data": {
1521+
"text/plain": [
1522+
"array([8220.059 , 8201.879 , 8192.549 , ..., 8229.558 , 8151.019 ,\n",
1523+
" 8127.3354], dtype=float32)"
1524+
]
1525+
},
1526+
"execution_count": 14,
1527+
"metadata": {},
1528+
"output_type": "execute_result"
1529+
}
1530+
],
1531+
"source": [
1532+
"sums_dev.copy_to_host()"
1533+
]
1534+
},
1535+
{
1536+
"cell_type": "code",
1537+
"execution_count": 17,
1538+
"id": "32802150-78e9-453a-83ae-f8b16c8ac8a5",
1539+
"metadata": {},
1540+
"outputs": [
1541+
{
1542+
"name": "stdout",
1543+
"output_type": "stream",
1544+
"text": [
1545+
"5.31 ms ± 20.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
1546+
]
1547+
}
1548+
],
1549+
"source": [
1550+
"%timeit col_sum_kernel[nr_blocks, nr_threads_per_block](A_dev, sums_dev); cuda.synchronize()"
1551+
]
1552+
},
1553+
{
1554+
"cell_type": "code",
1555+
"execution_count": null,
1556+
"id": "29f89a63-c827-484a-98dc-7d81311bd25f",
1557+
"metadata": {},
1558+
"outputs": [],
1559+
"source": []
12121560
}
12131561
],
12141562
"metadata": {

0 commit comments

Comments
 (0)