karimsayedre.github.io/RTIOW.html at master · karimsayedre/karimsayedre.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Ray Tracing In One Weekend In CUDA</title>

    <!-- General Meta -->
    <meta name="description" content="A showcase of my projects and portfolio.">
    <link rel="icon" href="icons/Beyond.png">

    <!-- Open Graph Meta (for Facebook, LinkedIn, etc.) -->
    <meta property="og:title" content="Karim Sayed - Rendering Engineer">
    <meta property="og:description" content="A showcase of my projects and portfolio.">
    <meta property="og:image"
        content="https://karimsayedre.github.io/images/RTIOW/2560x1440_50depth_3000samples_3400ms.png">
    <meta property="og:url" content="https://karimsayedre.github.io/">
    <meta property="og:type" content="website">

    <!-- Twitter Card Meta -->
    <meta name="twitter:card" content="summary_large_image">
    <meta name="twitter:title" content="Karim Sayed - Rendering Engineer">
    <meta name="twitter:description" content="A showcase of my projects and portfolio.">
    <meta name="twitter:image"
        content="https://karimsayedre.github.io/images/RTIOW/2560x1440_50depth_3000samples_3400ms.png">

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet"
        integrity="sha384-QWTKZyjpPEjISv5WaRU9OFeRpok6YctnYmDr5pNlyT2bRjXh0JMhjY6hW+ALEwIH" crossorigin="anonymous">

    <link rel="stylesheet" href="style/style.css">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.2/dist/js/bootstrap.bundle.min.js"
        integrity="sha384-Fy6S3B9q64WdZWQUiU+q4/2Lc9npb8tCaSX9FK7E8HnRr0Jz8D6OP9dO5Vg3Q9ct"
        crossorigin="anonymous"></script>

    <script src="scripts/images.js"></script>
    <script src="scripts/behaviour.js"></script>
    <script src="scripts/bars.js"></script>

    <link rel="preconnect" href="https://fonts.googleapis.com">
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
    <link
        href="https://fonts.googleapis.com/css2?family=Roboto:ital,wght@0,100;0,300;0,400;0,500;0,700;0,900;1,100;1,300;1,400;1,500;1,700;1,900&display=swap"
        rel="stylesheet">

    <link rel="stylesheet"
        href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/styles/atom-one-dark.css" />
    <!-- Highlight.js CSS theme -->
    <link rel="stylesheet"
        href="https://cdnjs.cloudflare.com/ajax/libs/highlightjs-line-numbers.js/11.11.1/styles/line-numbers.min.css" />

    <!-- Highlight.js library -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/highlight.min.js"></script>
    <link href="https://fonts.googleapis.com/css2?family=Fira+Code&display=swap" rel="stylesheet">

</head>


<body>
    <div id="top"></div>

    <script>
        navbar();
    </script>

    <div class="container">

        <article>
            <div class="collapsible">
                <h1>CUDA Ray Tracing 2x Faster Than RTX: My CUDA Ray Tracing Journey</h1>
                <!-- <p><em>Note: This is a draft version. Final edits are still in progress. Feedback is welcome while final
                        edits are underway.</em></p> -->
            </div>

            <img class="photo" src="images/RTIOW/2560x1440_50depth_3000samples_3400ms.png"
                alt="Ray Tracing in One Weekend render with 50 depth, 3000 samples, and 3400ms render time">

            <section class="section-header">
                <h2>Introduction</h2>

                <p>
                    Welcome! This article is a deep dive into how I made a CUDA-based ray tracer that outperforms a
                    Vulkan/RTX implementation by roughly 2x on the same hardware. If you're interested in
                    GPU programming, performance optimization, or just want to see how far you can push a path tracer,
                    you're in the right place.
                </p>
                <p>
                    The comparison is with <a href="https://github.com/GPSnoopy/RayTracingInVulkan" target="_blank"
                        rel="noopener noreferrer">RayTracingInVulkan</a> by GPSnoopy, a well-known Vulkan/RTX renderer.
                    My goal
                    wasn't just to port <em>Ray Tracing in One Weekend</em> to CUDA, but to squeeze every last
                    millisecond out of it—profiling, analyzing, and optimizing until the numbers surprised even me.
                    And this is actually how I learned CUDA.
                </p>
                <p>
                    In this write-up, I'll walk you through the journey: what worked, what didn't, and the key
                    tricks that made the biggest difference. Whether you're a graphics programmer, a CUDA
                    enthusiast, or just curious about real-world GPU optimization, I hope you'll find something useful
                    here.
                </p>

                <div class="gotcha-card pro-tip">
                    <div class="gotcha-marker pro-tip-marker"></div>
                    <div class="gotcha-content">
                        <h4>Note</h4>
                        <p>
                            The original title claimed a 3.6x speedup, which was true at the time of writing —
                            but after realizing I had forgotten to add Russian roulette to RayTracingInVulkan,
                            the performance difference shrank to 2x. That is still very significant, and the
                            comparison is fairer now.
                        </p>
                    </div>
                </div>


                <div class="perf-table-container">
                    <div class="perf-table-container">
                        <table class="perf-table glow-table">
                            <thead>
                                <tr>
                                    <th>Renderer</th>
                                    <th>Graphics API</th>
                                    <th>Hardware Acceleration</th>
                                    <th>Geometry Types</th>
                                    <th>GPU Time</th>
                                    <th>Performance (FPS)</th>
                                    <th>Notes</th>
                                </tr>
                            </thead>
                            <tbody>
                                <tr>
                                    <td class="spec-value"><a href="https://github.com/GPSnoopy/RayTracingInVulkan"
                                            rel="noopener noreferrer" target="_blank">RayTracingInVulkan</a> (GPSnoopy)
                                    </td>
                                    <td class="spec-value">Vulkan</td>
                                    <td class="spec-value">RTX acceleration</td>
                                    <td class="spec-value">Procedural sphere tracing + triangle modes</td>
                                    <td class="spec-value fps-highlight">~20 ms</td>

                                    <td class="spec-value fps-highlight">~50 FPS</td>
                                    <td class="spec-value">
                                        <ul>
                                            <li>Added Russian roulette for a fair comparison</li>
                                            <li>No acceleration structure compaction</li>
                                            <li>Using procedural AABBs per sphere</li>
                                            <li>Using ray tracing pipeline (no inline ray tracing)</li>

                                        </ul>
                                    </td>
                                </tr>
                                <tr>
                                    <td class="spec-value"><a
                                            href="https://github.com/karimsayedre/CUDA-Ray-Tracing-In-One-Weekend"
                                            rel="noopener noreferrer"
                                            target="_blank">CUDA-Ray-Tracing-In-One-Weekend</a> (Mine)</td>
                                    <td class="spec-value">CUDA</td>
                                    <td class="spec-value">No hardware RT cores</td>
                                    <td class="spec-value">Procedural spheres only</td>
                                    <td class="spec-value fps-highlight">~8 ms</td>
                                    <td class="spec-value fps-highlight">105 FPS</td>
                                    <td class="spec-value">
                                        <ul>
                                            <li>Same resolution and settings</li>
                                            <li>Different sphere locations and materials</li>
                                            <li>Implements what we call "inline ray tracing" (without hardware RT
                                                pipeline,
                                                though)</li>
                                        </ul>
                                    </td>
                                </tr>
                            </tbody>
                        </table>
                    </div>
                    <p>
                        Why is the Vulkan/RTX version slower? While there are many contributing factors, one likely
                        explanation comes from <strong>Tanguy Fautré (GPSnoopy)</strong>, the author of <a
                            href="https://github.com/GPSnoopy/RayTracingInVulkan" target="_blank"
                            rel="noopener noreferrer">RayTracingInVulkan</a>, who shared his insights on why
                        procedural ray tracing may underperform on NVIDIA RTX GPUs:
                    </p>
                    <blockquote class="quote">
                        “My suspicion is that procedural spheres are relatively cheap to compute (both the ray
                        intersection and shading), leaving the compute units mostly idling while the RT units are
                        fully
                        utilized doing BVH traversal. Thus the performance in this case is entirely limited by the
                        RT
                        units.
                        <br>
                        <br>
                        Interestingly, this article (and the Radeon RX 6900 XT results in RayTracingInVulkan
                        procedural
                        benchmarks, a GPU where the BVH traversal is handled by the compute units rather than its RT
                        units) tend to support the idea that doing the entire BVH traversal using only the compute
                        units
                        is faster than delegating to the RT units. At least on the GeForce 3000 series and the
                        Radeon RX
                        6000 series, that is.
                        <br>
                        <br>

                        In practice, the test scene is an unlikely scenario in gaming. In a modern AAA game, the
                        compute
                        cores will be actively used for shading and rendering the game, leaving little room on those
                        units for doing the BVH traversal, while most (all?) of the ray intersections will be done
                        against triangles (a task at which RT units excel, especially on later generation GPUs).”
                        <br>
                        <span class="quote-author">
                            - Tanguy Fautré (GPSnoopy)
                        </span>
                    </blockquote>


                    <p>
                        Supporting this theory, <strong>RayTracingInVulkan</strong> consistently benchmarks better
                        on
                        AMD
                        cards, such as the Radeon RX 6900 XT, which perform BVH traversal using compute units rather
                        than
                        dedicated RT hardware. This suggests that—at least on NVIDIA's 3000 series and AMD's 6000
                        series—doing everything in compute can outperform using fixed-function RT cores when the
                        workload
                        involves minimal shading and simple procedural intersections.
                    </p>
                    <p>
                        This also ties directly into NVIDIA's own guidance, which emphasizes that RT cores are
                        architected to
                        be most efficient
                        with triangle geometry—not procedural primitives like spheres or AABBs:
                    </p>
                    <blockquote class="quote">
                        “Use triangles over AABBs. RTX GPUs excel in accelerating traversal of AS created from
                        triangle
                        geometry.”
                        <br>
                        <span class="quote-author">
                            – <a href="https://developer.nvidia.com/blog/best-practices-for-using-nvidia-rtx-ray-tracing-updated/"
                                target="_blank" rel="noopener noreferrer">NVIDIA Developer Blog</a>
                        </span>
                    </blockquote>
                    <p>
                        Of course, this is a synthetic scenario. In a typical AAA game, compute cores are heavily
                        loaded
                        with shading and post-processing tasks, and most ray intersections are against triangles—a
                        case
                        where RT cores excel, especially on newer generations of GPUs.
                    </p>

                    <p>
                        Another reason might be the ray tracing pipeline itself. While powerful and flexible, the
                        hardware
                        RT pipeline often incurs more overhead than inline ray tracing (Ray query). It tends to make
                        heavy use of VRAM bandwidth by moving payload
                        data around between shader stages. On the other hand, inline ray tracing can keep most of
                        the
                        data
                        in registers, which is exactly what's happening in my implementation. So you can consider my
                        approach a form of <strong>inline ray tracing</strong>.
                        This register-centric design drastically cuts down memory traffic and boosts performance.
                    </p>

                    <p>
                        So yes, it may sound like clickbait—but it's <em>technically</em> accurate, and when you dig
                        into
                        sample rates, shader complexity, geometry types, and hardware, the numbers hold up. In this
                        article,
                        I'll peel back the layers of how I squeezed 2x performance out through CUDA-level
                        optimizations,
                        giving you an exciting taste of what's possible when you really dig deep into cache
                        behavior,
                        register pressure, and GPU optimization.
                    </p>

                    <h3> Why CUDA?</h3>
                    <p>
                        As a graphics programmer, I'm constantly pushing the limits of what the GPU can do. But I
                        realized
                        that knowing just high-level shading languages or APIs like Vulkan or DirectX wasn't
                        enough—I
                        needed
                        to understand the machine itself. CUDA gave me the lowest-level, most explicit way to
                        explore
                        how
                        GPUs schedule threads, manage memory, and hit (or miss) performance targets. And with the
                        help
                        of
                        <strong>Nsight Compute</strong>, I wasn't just reading theory—I was hands-on, exploring real
                        bottlenecks, discovering how latency hiding works, learning about warp scheduling, cache
                        behavior,
                        and so much more. It introduced me to performance concepts I hadn't encountered before, and
                        grounded
                        them in actual numbers and experimentation.
                    </p>

                    <p>And I didn't want to "just learn a language." I wanted to <strong>learn CUDA as a suite of
                            tools</strong>, to
                        really get under the hood of how GPU code runs, stalls, and gets optimized. So I asked
                        myself:
                        what's the best way to do that for a graphics programmer?
                    </p>

                    <p><strong>Answer:</strong> write a ray tracer from scratch in CUDA… and then squeeze it until
                        it
                        screams.</p>

                    <p>This article walks you through how I implemented a naive CUDA port of <em>Ray Tracing in One
                            Weekend</em>
                        that
                        ran at <strong>2.5 seconds per frame</strong>, and optimized it down to <strong>9
                            milliseconds</strong>. Along the way, I hit every wall I could—scoreboard stalls,
                        branching
                        hell,
                        memory layout issues—and learned how to knock each one down.</p>

                    <p>This isn't a language learning blog. It's an <strong>optimization story</strong>. A journey
                        into
                        how
                        GPUs
                        really work, and what it takes to make them fly.</p>

                    <p>And if you're into ray tracing, performance hacking, or just enjoy watching frame times
                        drop—you're
                        in
                        the right place.</p>

                    <p>You can check out the source code, along with its commit history, in the <a
                            href="https://github.com/karimsayedre/CUDA-Ray-Tracing-In-One-Weekend" target="_blank"
                            rel="noopener noreferrer">CUDA-Ray-Tracing-In-One-Weekend repository on GitHub</a>.
                    </p>

                    <h3>Specifications:</h3>
                    <p>
                        To give proper context to the performance numbers and optimizations discussed in this
                        article,
                        it's
                        important to understand the hardware I tested on. These specs shaped not only what was
                        possible,
                        but
                        also where the real bottlenecks and wins emerged during tuning.
                    </p>
                    <div class="perf-table-container">
                        <table class="perf-table glow-table">
                            <thead>
                                <tr>
                                    <th>Component</th>
                                    <th>Specification</th>
                                </tr>
                            </thead>
                            <tbody>
                                <tr>
                                    <td>CPU</td>
                                    <td class="spec-value">i5 13600KF</td>
                                </tr>
                                <tr>
                                    <td>GPU</td>
                                    <td class="spec-value">RTX 3080 10GB Desktop</td>
                                </tr>
                                <tr>
                                    <td>CUDA Version</td>
                                    <td class="spec-value">12.9</td>
                                </tr>
                                <tr>
                                    <td>Resolution</td>
                                    <td class="spec-value">1280x720</td>
                                </tr>
                                <tr>
                                    <td>Samples</td>
                                    <td class="spec-value">30</td>
                                </tr>
                                <tr>
                                    <td>Max Ray Depth</td>
                                    <td class="spec-value">50</td>
                                </tr>
                            </tbody>
                        </table>
                    </div>

                    <h3>The Starting Point: A Naive CUDA Ray Tracer</h3>

                    <p>Before any optimizations, I started with a direct CUDA port of <em>Ray Tracing in One
                            Weekend</em>. No fancy tricks — just one thread launched per pixel, tracing rays
                        recursively, <strong>plus traversing a BVH, so yes, we're already doing better than
                            O(N) intersection tests per ray here.</strong></p>

                    <p>And it worked. Technically. But it was slow — <strong>2.5 seconds per frame</strong> kind of
                        slow, slower than my old CPU version, which took 1.5 seconds. Each thread handled one
                        pixel, there was no memory layout optimization, and no thought was given to how
                        branching or recursion would behave on the GPU.</p>

                    <p>This was intentional. I wanted to <strong>start <i>almost</i> from zero</strong> — or, more
                        precisely, from what I thought was fast the last time I tried to optimize it :)</p>

                    <p>So with a chunky frame time and profiler in hand, I started breaking it down. Where was the
                        time
                        going?
                        What was stalling? Why did a GPU that could chew through teraflops look like it was running
                        on a
                        potato?
                    </p>

                    <p>Time to find out ... But first...</p>

                    <h3>What CUDA Gives You (and What It Punishes You For)</h3>

                    <p>CUDA is amazing because it gives you <strong>bare-metal control</strong> over how your code
                        runs on
                        the
                        GPU. You're not writing shader code inside an engine or hoping a compiler figures things out
                        —
                        you're
                        the compiler. You're the scheduler. You're the reason your app runs fast... or doesn't.</p>

                    <p>But with that power come the traps. And the first trap I stepped into was
                        <strong>recursion</strong>.
                    </p>

                    <p>Recursion on the GPU sounds elegant — until you realize it's <strong>kryptonite for
                            performance</strong>.
                        Why?</p>
                    <ul>
                        <li><strong>Register pressure:</strong> every level of recursion eats more registers, and
                            once
                            you're
                            out, you're spilling to memory.</li>
                        <li><strong>Local memory access:</strong> spilled data goes to local memory, which is slow,
                            and you
                            don't get to control the layout.</li>
                        <li><strong>Stack usage:</strong> recursive calls build a big stack, and that stack sits in
                            memory,
                            not
                            registers.</li>
                        <li><strong>Warp divergence:</strong> recursion usually means branching, and branching
                            destroys SIMT
                            efficiency.</li>
                    </ul>

                    <p>Next mistake? I thought about trying inheritance for materials and objects. Turns out
                        <strong>virtual
                            calls and dynamic polymorphism</strong> are not CUDA's friends. Even if it compiles, the
                        cost is
                        brutal. You could go for <strong>static polymorphism</strong> (templates or CRTP), but that
                        starts
                        to
                        bloat code size fast — and I honestly didn't push it far enough to know if the tradeoff was
                        worth
                        it.
                    </p>

                    <p>On a brighter note, if you're coming from C++ graphics work, you'll be happy to know that
                        <strong>GLM
                            works with CUDA</strong>. I used it throughout the project, and the performance hit was
                        negligible —
                        way better than writing custom vector/matrix types from scratch.
                    </p>

                    <p>Bottom line: CUDA gives you tools to go fast, but it doesn't forgive bad habits from CPU
                        land. You
                        have
                        to think like the GPU... SIMT, parallel, latency hiding — or suffer.</p>


                    <h3>Register Pressure: The Silent Killer of GPU Performance</h3>

                    <p>One of the first things I had to come to terms with in CUDA is that <strong>registers are
                            everything</strong>. They're the fastest memory the GPU has, and CUDA tries to keep as
                        much data
                        in
                        them as possible. But once you run out, you're in trouble.</p>

                    <p><strong>Register pressure</strong> happens when your kernel uses too many registers per
                        thread.
                        Sounds
                        innocent, but it can kill performance in more than one way:</p>

                    <ul>
                        <li><strong>Lower occupancy:</strong> Each Streaming Multiprocessor (SM) has a limited
                            number of
                            registers. If your kernel uses too many per thread, fewer threads can run at once,
                            lowering
                            occupancy and throughput.</li>
                        <li><strong>Spilling to local memory:</strong> When the compiler can't fit everything in
                            registers,
                            it
                            spills to local memory — which lives in global memory space. That's a huge latency hit.
                        </li>
                        <li><strong>Instruction stalls:</strong> Excessive register usage can increase instruction
                            dependencies
                            and limit ILP (instruction-level parallelism), causing more stalls even within a warp.
                        </li>
                    </ul>

                    <p>So, how do you know if register pressure is too high?</p>

                    <ul>
                        <li><strong>Profiler tells you:</strong> Nsight Compute and Nsight Systems will show
                            register count,
                            occupancy, and spill stores/loads. If you're seeing spill activity, you're over budget.
                        </li>
                        <li><strong>Occupancy below expected levels:</strong> If you're running a small kernel but
                            seeing
                            25-50%
                            occupancy, it's a red flag. Check the register usage per thread.</li>
                        <li><strong>Nsight Compute:</strong> Its Occupancy section actually tells you which resource
                            is limiting your occupancy!</li>
                    </ul>


                    <div class="image-container section-header" data-preview="true">
                        <h3>Pro Tip</h3>

                        <img src="images/RTIOW/Screenshot 2025-06-13 193124.png" class="preview-image"
                            alt="CUDA Scheduler Performance">

                        <div class="gotcha-card">
                            <div class="gotcha-marker pro-tip-marker"></div>

                            <div class="image-comments">
                                <h4>Always compile with <code>-Xptxas=-v</code></h4>
                                <p>This prints a resource summary for each compiled function: how many registers
                                    it uses, how many bytes were spilled to memory, and how big the stack frame is.
                                </p>
                                <h5>Use Nsight Compute's built-in occupancy calculator!</h5>
                                <p>This is <strong>really</strong> useful: you give it information about your kernel,
                                    and it tells you what's actually limiting your occupancy. Neat!</p>
                            </div>
                        </div>
                    </div>


                    <p>In my case, recursion was the big offender — each level of recursion held ray state,
                        intersection
                        info,
                        and more. Once I removed recursion and moved to an explicit stack in registers, I gained
                        control. I
                        could reuse memory, limit stack depth, and avoid unnecessary spills.</p>

                    <p>If you want your GPU code to fly, managing register pressure is a must. You're always
                        balancing
                        performance against code clarity and flexibility — and in CUDA, it's better to stay lean.
                    </p>
            </section>

            <section class="section-header">

                <h2 class="optimization-title">Opt #1 — Aggressive Inlining via Header-Only CUDA Design</h2>

                <p>
                    In CUDA, performance often hinges on inlining. Unlike traditional C++, CUDA's
                    <code>__device__</code>
                    and <code>__host__ __device__</code> functions need to be visible at compile time for the compiler
                    to
                    inline them. Initially, I followed a standard C++ pattern: defining classes in <code>.cuh</code>
                    headers
                    and implementing them in separate <code>.cu</code> files.
                </p>

                <p>
                    That design turned out to be <strong>devastating for performance</strong>. NVCC wasn't able to
                    inline
                    key device functions, resulting in excessive register spilling, increased function-call overhead, and
                    significant slowdown — even in release builds.
                </p>

                <p>
                    After switching to a <strong>header-only design</strong> (all device code inlined in
                    <code>.cuh</code>,
                    <i>well</i>, <code>.h</code> headers), everything changed: NVCC inlined everything into the
                    rendering
                    mega-kernel in release mode,
                    minimizing register usage and boosting performance.
                </p>

                <h3>Why CUDA Header-Only Design Matters</h3>
                <ol>
                    <li>
                        <strong>Limited Device Function Linkage:</strong> Device functions need to be visible at compile
                        time to be inlined. CUDA doesn't support separate compilation and linking as robustly as C++ for
                        device code.
                    </li>
                    <li>
                        <strong>Relocatable Device Code (RDC):</strong> You can enable it using <code>-rdc=true</code>,
                        but:
                        <ul>
                            <li>Compiles much slower.</li>
                            <li>Introduces link-time complexity.</li>
                            <li>May reduce inlining and hurt performance.</li>
                        </ul>
                    </li>
                    <li>
                        <strong>Inlining = Performance:</strong> For GPU kernels — especially mega-kernels in a path
                        tracer
                        — aggressive inlining means:
                        <ul>
                            <li>Fewer spills.</li>
                            <li>Less register pressure.</li>
                            <li>Better instruction scheduling.</li>
                        </ul>
                    </li>
                </ol>

                <h3>Before vs After</h3>

                <div class="perf-table-container">
                    <table class="perf-table glow-table">
                        <thead>
                            <tr>
                                <th>Design</th>
                                <th>Inlining</th>
                                <th>Register Spills</th>
                                <th>Compile Time</th>
                                <th>Runtime Performance</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td><code>.cu</code> per class</td>
                                <td>Poor</td>
                                <td>High</td>
                                <td>Short</td>
                                <td class="bad">Slow</td>
                            </tr>
                            <tr>
                                <td><code>.cuh</code> header-only</td>
                                <td>Excellent</td>
                                <td>Minimal</td>
                                <td>Long</td>
                                <td class="good">Fast</td>
                            </tr>
                        </tbody>
                    </table>
                </div>

                <p class="perf-note">
                    ✱ Verdict: <em>Go header-only for all device code unless you absolutely need RDC. Let the
                        compiler
                        see
                        everything. Let it inline everything.</em>
                </p>

            </section>

            <section class="section-header">
                <h2>Opt #2 — Killing Recursion with an Explicit Stack</h2>

                <p>To eliminate recursion and cut down register pressure, I rewrote the BVH traversal to use an
                    <strong>explicit stack in registers</strong>. The old code relied on a clean recursive structure
                    like
                    this:
                </p>

                <pre><code class="language-cpp">bool BVHNode::Hit(const Ray& r, float tMin, float tMax, HitRecord& rec) const
{
    // Reject the whole subtree when the ray misses this node's bounding box.
    if (!m_Box.Hit(r, tMin, tMax))
        return false;

    // Recurse into both children. If the left child was hit, the right child is
    // only searched up to rec.T instead of the original tMax.
    bool hitLeft  = m_Left->Hit(r, tMin, tMax, rec);
    bool hitRight = m_Right->Hit(r, tMin, hitLeft ? rec.T : tMax, rec);

    return hitLeft || hitRight;
}
</code></pre>

                <p>Readable? Yes. GPU-friendly? Not at all. Every call stacks up ray data, bounding boxes, hit records —
                    and
                    on a GPU, that means <strong>registers and stack memory</strong> fill up fast.</p>

                <p>The new version looks like this:</p>

                <pre><code class="language-cpp">__device__ bool Hit(const Ray& r, const Float tMin, Float tMax, HitRecord& rec) const
{
    // Fixed-capacity array of nodes still to visit; stack_ptr is the number of entries in use.
    // NOTE(review): nothing here checks stack_ptr against the capacity of 16 -
    // confirm the BVH depth keeps it in range.
    Hittable* stack[16];
    int		  stack_ptr      = 0;
    bool	  hit_anything	 = false;
    // Upper end of the search interval; lowered to temp_rec.T whenever a leaf reports a hit.
    Float	  closest_so_far = tMax;

    // Push root children (right first, then left to process left first)
    stack[stack_ptr++] = m_Right;
    stack[stack_ptr++] = m_Left;

    while (stack_ptr > 0)
    {
        Hittable* node = stack[--stack_ptr];

        // Early out: Skip nodes whose AABB doesn't intersect [tMin, closest_so_far]
        AABB box;
        node->GetBoundingBox(0, 0, box);
        if (!box.Hit(r, tMin, closest_so_far))
            continue;

        if (node->IsLeaf())
        {
            // Leaf: test it against the current interval and keep the record on a hit.
            HitRecord temp_rec;
            if (node->Hit(r, tMin, closest_so_far, temp_rec))
            {
                hit_anything   = true;
                closest_so_far = temp_rec.T;
                rec			   = temp_rec;
            }
        }
        else
        {
            // Not a leaf: treat the node as a BVHNode and queue both of its children.
            BVHNode* bvh_node = static_cast&lt;BVHNode*&gt;(node);
            // Push children in reverse order (right first, left next)
            stack[stack_ptr++] = bvh_node->m_Right;
            stack[stack_ptr++] = bvh_node->m_Left;
        }
    }
    return hit_anything;
}
</code></pre>

                <h3>Comparison</h3>
                <div class="perf-table-container">
                    <table class="perf-table glow-table">
                        <thead>
                            <tr>
                                <th>Metric</th>
                                <th>Before</th>
                                <th>After</th>
                                <th>Improvement</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>Frame Time</td>
                                <td>2.5s</td>
                                <td>300ms</td>
                                <td class="improvement">-2.2s (-88%)</td>
                            </tr>
                            <tr>
                                <td>Stack Memory/Thread</td>
                                <td>High (recursive, unbounded)</td>
                                <td>Low (fixed-size array)</td>
                                <td class="improvement">Predictable, no dynamic stack size needed</td>
                            </tr>
                            <tr>
                                <td>Register Pressure</td>
                                <td>High (per recursion level)</td>
                                <td>Lower (single loop, reused variables)</td>
                                <td class="improvement">Fewer spills, higher occupancy</td>
                            </tr>
                            <tr>
                                <td>Control Flow</td>
                                <td>Deep recursion, many branches</td>
                                <td>Flat loop, fewer branches</td>
                                <td class="improvement">Less warp divergence</td>
                            </tr>
                            <tr>
                                <td>Debuggability</td>
                                <td>Hard (stack overflows, deep call stacks)</td>
                                <td>Easy (explicit stack, easier to trace)</td>
                                <td class="improvement">Simpler to debug and profile</td>
                            </tr>
                            <tr>
                                <td>Occupancy</td>
                                <td>Lower (due to stack/register usage)</td>
                                <td>Higher (more threads per SM)</td>
                                <td class="improvement">Better GPU utilization</td>
                            </tr>
                        </tbody>
                    </table>
                </div>


                <p>Now the traversal is entirely iterative, using a compact fixed-size array as the traversal
                    stack (16 entries here; how many you need depends on how deep the BVH is, and therefore
                    on how many nodes there are) and minimizing memory overhead.</p>

                <p>The key improvements:</p>

                <ul>
                    <li><strong>No recursion:</strong> No stack growth, no call overhead, no nested register use.
                    </li>
                    <li><strong>Warp-coherent traversal:</strong> Front-to-back traversal increases chances of early
                        exit,
                        which avoids extra intersection tests.</li>
                </ul>

                <p>This one change gave me a big win in performance and stability — no more surprise stack overflows
                    or
                    slowdowns due to spills.</p>
            </section>

            <section class="section-header">
                <h2 class="optimization-title">Opt #3 — Don't Recompute What You Already Know</h2>
                <p>
                    Here's a simple but powerful axiom in real-time ray tracing:
                    <strong>Precompute what doesn't change.</strong> If you know you're going to need a value frequently
                    — especially one that's expensive to compute — then compute it once, store it, and reuse it.
                </p>

                <p>
                    Take the bounding box of a scene or a node in the BVH. If it's built once during scene setup and
                    never changes, there's no reason to recompute it every time a ray passes through. That's just
                    wasting cycles.
                </p>

                <p>
                    For example, this code:
                </p>

                <pre><code class="language-cpp">__device__ AABB HittableList::GetBoundingBox() const
{
    // Rebuilds the list's bounding box from scratch on every call by folding each
    // object's box into the result with SurroundingBox.
    // time0/time1 are not declared in this excerpt (presumably defined elsewhere
    // in the original source).
    AABB outputBox;
    AABB tempBox;
    bool firstBox = true;

    for (uint32_t i = 0; i &lt; m_Count; i++)
    {
        tempBox   = m_Objects[i]->GetBoundingBox(time0, time1);
        // The first box seeds the result; every later box is merged into it.
        outputBox = firstBox ? tempBox : SurroundingBox(outputBox, tempBox);
        firstBox  = false;
    }

    return outputBox;
}</code></pre>

                <p>
                    ...does the job, but it's doing way too much. We already know what the result is going to be — it's
                    the
                    same every time. So instead, cache it in the BVH construction stage.
                </p>

                <div class="perf-table-container">
                    <table class="perf-table glow-table">
                        <thead>
                            <tr>
                                <th>Metric</th>
                                <th>Before</th>
                                <th>After</th>
                                <th>Improvement</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>Frame Time</td>
                                <td>300ms</td>
                                <td>200ms</td>
                                <td class="improvement">-100ms (-33.3%)</td>
                            </tr>
                            <tr>
                                <td>Bounding Box Computations</td>
                                <td>Per ray traversal</td>
                                <td>Once at BVH build</td>
                                <td class="improvement">Eliminated redundant calculations</td>
                            </tr>
                            <tr>
                                <td>Global Memory Accesses</td>
                                <td>Higher</td>
                                <td>Lower</td>
                                <td class="improvement">Fewer loads per ray</td>
                            </tr>
                            <tr>
                                <td>Code Simplicity</td>
                                <td>More complex (repeated logic)</td>
                                <td>Simpler (cached value)</td>
                                <td class="improvement">Cleaner, easier to maintain</td>
                            </tr>
                        </tbody>
                    </table>
                </div>

                <p>
                    Cleaner, faster, and more GPU-friendly.
                </p>

                <p>
                    Little changes like this can mean a lot when you're tracing millions of rays per frame. Always
                    ask
                    yourself: "Can I compute this once and store it?" If yes — do it.
                </p>

                <div class="gotcha-card pro-tip">
                    <div class="gotcha-marker pro-tip-marker"></div>
                    <div class="gotcha-content">
                        <h4>Gotcha: Moving Spheres and Dynamic AABBs</h4>
                        <p>
                            The above optimization—caching bounding boxes—works perfectly for static geometry.
                            However,
                            if your scene contains <strong>moving spheres</strong> (as in the <em>Ray Tracing in One
                                Weekend</em> book), their AABBs depend on time and <strong>cannot be cached</strong>
                            at
                            BVH build time. In that case, you must recompute the bounding box for each ray's time
                            value.
                            The example here uses static spheres intentionally to enable this optimization.
                        </p>

                        <p>
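                            <strong>For dynamic AABBs:</strong> Maybe you can use linear interpolation (lerp) to
                            blend between two bounding boxes if you want to visualize or animate the transition
                            between them. For example, to interpolate between two AABBs (axis-aligned bounding
                            boxes) <code>boxA</code> and <code>boxB</code> at time <code>t</code> (where
                            <code>t</code> is in [0,1]), lerp the minimum and maximum corners independently:
                            <code>min = lerp(boxA.min, boxB.min, t)</code> and
                            <code>max = lerp(boxA.max, boxB.max, t)</code>.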

                        </p>
                    </div>
                </div>
            </section>

            <section class="section-header">
                <h2>Opt #4 — Early Termination for Low Contributing Rays</h2>
                <p>
                    This one's simple but powerful. If a ray's contribution becomes negligible, we just stop tracing
                    it.
                    There's no point in wasting GPU cycles on a ray that's not adding anything visible to the final
                    image.
                </p>
                <pre><code class="language-cpp">// Early termination for very low contribution
if (fmaxf(cur_attenuation.x, fmaxf(cur_attenuation.y, cur_attenuation.z)) &lt; 0.001f)
    break;
</code></pre>

                <div class="perf-table-container">
                    <table class="perf-table glow-table">
                        <thead>
                            <tr>
                                <th>Metric</th>
                                <th>Before</th>
                                <th>After</th>
                                <th>Improvement</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>Frame Time</td>
                                <td>200ms</td>
                                <td>160ms</td>
                                <td class="improvement">-40ms (-20%)</td>
                            </tr>
                            <tr>
                                <td>Average Ray Depth</td>
                                <td>More</td>
                                <td>Less</td>
                                <td class="improvement">Less depth per ray</td>
                            </tr>
                            <tr>
                                <td>Noise</td>
                                <td>Low</td>
                                <td>Slightly higher</td>
                                <td class="improvement">More noise (acceptable)</td>
                            </tr>
                        </tbody>
                    </table>
                </div>

            </section>

            <section class="section-header">
                <h2>Opt #5 — Russian Roulette</h2>
                <p>
                    Early termination is good — but we can go further with <strong>Russian Roulette</strong>. After a
                    few bounces, we probabilistically decide whether a ray should continue or not, based on its current
                    energy.
                    This avoids wasting time on rays that contribute very little, while still preserving the statistical
                    integrity of the image.
                </p>
                <pre><code class="language-cpp">// Russian Roulette
// The survival probability is the largest channel of the current attenuation.
float surviveProbability = fmaxf(cur_attenuation.x, fmaxf(cur_attenuation.y, cur_attenuation.z));
if (i > 3) {
    // Stop tracing when the random draw exceeds the survival probability.
    if (curand_uniform(&state) > surviveProbability)
        break;
    // Paths that survive are divided by that probability to compensate for the ones that were cut.
    cur_attenuation /= surviveProbability;
}
</code></pre>

                <div class="perf-table-container">
                    <table class="perf-table glow-table">
                        <thead>
                            <tr>
                                <th>Metric</th>
                                <th>Before</th>
                                <th>After</th>
                                <th>Improvement</th>
                            </tr>
                        </thead>
                        <tbody>
                            <tr>
                                <td>Frame Time</td>