Skip to content

Commit bb09ff2

Browse files
committed
Enhanced examples.
1 parent a2cd7e2 commit bb09ff2

File tree

3 files changed

+112
-110
lines changed

3 files changed

+112
-110
lines changed

TutorialApp/AcceleratorExample.cs

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -77,19 +77,22 @@ private static void GPUWorkChung(ArrayView2D<byte, Stride2D.DenseY> srcArray,
7777
/// This method uses a classical ILGPU execution pattern, which relies on the ILGPU
7878
/// internal cache mechanism to load the kernel efficiently.
7979
/// </summary>
80-
private static float[,] DoClassicalPattern(Accelerator a, Index2D dataSize)
80+
private static (long, float[,]) DoClassicalPattern(Accelerator a, Index2D dataSize, Stopwatch sw)
8181
{
8282
float[,] result = new float[dataSize.X, dataSize.Y];
8383
using MemoryBuffer2D<byte, Stride2D.DenseY> srcArray = a.Allocate2DDenseY<byte>(dataSize);
8484
using MemoryBuffer2D<float, Stride2D.DenseY> dstArray = a.Allocate2DDenseY<float>(dataSize);
8585
srcArray.MemSet(128);
8686

87+
sw.Reset();
88+
sw.Start();
8789
//////////////////////////////////////////////////////
8890
// The Classical pattern loads the kernel by relying on the ILGPU internal cache mechanism.
8991
// Unfortunately there must be an internal bug because in most situations
9092
// the kernel seems to be compiled again and again.
9193
var kernel = a.LoadStreamKernel<ArrayView2D<byte, Stride2D.DenseY> , ArrayView2D<float, Stride2D.DenseY>>(GPUWorkChung);
9294
//////////////////////////////////////////////////////
95+
sw.Stop();
9396

9497
//Execute the kernel with the specified grid and group dimensions.
9598
kernel(new KernelConfig(a.MaxNumGroupsExtent.Item1, a.MaxNumGroupsExtent.Item2),
@@ -98,7 +101,7 @@ private static void GPUWorkChung(ArrayView2D<byte, Stride2D.DenseY> srcArray,
98101
);
99102
a.Synchronize();
100103
dstArray.CopyToCPU(result);
101-
return result;
104+
return (sw.ElapsedMilliseconds, result);
102105
}
103106

104107
/// <summary>
@@ -107,13 +110,15 @@ private static void GPUWorkChung(ArrayView2D<byte, Stride2D.DenseY> srcArray,
107110
/// The difference is only in utilizing the cache of already compiled kernels to prevent
108111
/// a known bug in the ILGPU internal cache mechanism.
109112
/// </summary>
110-
private static float[,] DoGPUWrappedAcceleratorPattern(GPUWrappedAccelerator a, Index2D dataSize)
113+
private static (long, float[,]) DoGPUWrappedAcceleratorPattern(GPUWrappedAccelerator a, Index2D dataSize, Stopwatch sw)
111114
{
112115
float[,] result = new float[dataSize.X, dataSize.Y];
113116
using MemoryBuffer2D<byte, Stride2D.DenseY> srcArray = a.AccelObj.Allocate2DDenseY<byte>(dataSize);
114117
using MemoryBuffer2D<float, Stride2D.DenseY> dstArray = a.AccelObj.Allocate2DDenseY<float>(dataSize);
115118
srcArray.MemSet(128);
116119

120+
sw.Reset();
121+
sw.Start();
117122
//////////////////////////////////////////////////////
118123
// The GPUWrappedAccelerator pattern loads the compiled kernel
119124
// using the accelerator's pinned cache of already compiled named kernels.
@@ -127,6 +132,7 @@ private static void GPUWorkChung(ArrayView2D<byte, Stride2D.DenseY> srcArray,
127132
() => a.AccelObj.LoadStreamKernel<ArrayView2D<byte, Stride2D.DenseY>, ArrayView2D<float, Stride2D.DenseY>>(GPUWorkChung)
128133
);
129134
///////////////////////////////////////////////////////
135+
sw.Stop();
130136

131137
//Execute the kernel with the specified grid and group dimensions.
132138
kernel(a.GetKernelConfig(srcArray.Length, true),
@@ -135,7 +141,7 @@ private static void GPUWorkChung(ArrayView2D<byte, Stride2D.DenseY> srcArray,
135141
);
136142
a.AccelObj.Synchronize();
137143
dstArray.CopyToCPU(result);
138-
return result;
144+
return (sw.ElapsedMilliseconds, result);
139145
}
140146

141147
/// <summary>
@@ -151,39 +157,39 @@ public void Run()
151157
int repetitions = 1000;
152158
Index2D dataSize = new Index2D(128, 512);
153159

154-
void PerformClassicalPatternTest()
160+
long PerformClassicalPatternTest()
155161
{
156162
using Accelerator accelerator = _device.CreateAccelerator(_context);
163+
long sumMilliseconds = 0;
157164
for (int i = 0; i < repetitions; i++)
158165
{
159-
_ = DoClassicalPattern(accelerator, dataSize);
166+
long elapsedMilliseconds = 0;
167+
(elapsedMilliseconds, _) = DoClassicalPattern(accelerator, dataSize, sw);
168+
sumMilliseconds += elapsedMilliseconds;
160169
}
170+
return sumMilliseconds;
161171
}
162172

163-
void PerformGPUWrappedAcceleratorPatternTest()
173+
long PerformGPUWrappedAcceleratorPatternTest()
164174
{
165175
using GPUWrappedAccelerator accelerator = new(_context, _device);
176+
long sumMilliseconds = 0;
166177
for (int i = 0; i < repetitions; i++)
167178
{
168-
_ = DoGPUWrappedAcceleratorPattern(accelerator, dataSize);
179+
long elapsedMilliseconds = 0;
180+
(elapsedMilliseconds, _) = DoGPUWrappedAcceleratorPattern(accelerator, dataSize, sw);
181+
sumMilliseconds += elapsedMilliseconds;
169182
}
183+
return sumMilliseconds;
170184
}
171185

172-
Console.WriteLine("Classical pattern test started...");
173-
sw.Reset();
174-
sw.Start();
175-
PerformClassicalPatternTest();
176-
sw.Stop();
177-
long classicalPatternTime = sw.ElapsedMilliseconds;
178-
Console.WriteLine($" Execution took {classicalPatternTime} ms.");
179-
180186
Console.WriteLine("GPUWrappedAccelerator pattern test started...");
181-
sw.Reset();
182-
sw.Start();
183-
PerformGPUWrappedAcceleratorPatternTest();
184-
sw.Stop();
185-
long gpuWrappedAcceleratorPatternTime = sw.ElapsedMilliseconds;
187+
long gpuWrappedAcceleratorPatternTime = PerformGPUWrappedAcceleratorPatternTest();
186188
Console.WriteLine($" Execution took {gpuWrappedAcceleratorPatternTime} ms.");
189+
190+
Console.WriteLine("Classical pattern test started...");
191+
long classicalPatternTime = PerformClassicalPatternTest();
192+
Console.WriteLine($" Execution took {classicalPatternTime} ms.");
187193

188194
double ratio = Math.Round(classicalPatternTime >= gpuWrappedAcceleratorPatternTime ?
189195
(double)classicalPatternTime / (double)gpuWrappedAcceleratorPatternTime
@@ -192,7 +198,7 @@ void PerformGPUWrappedAcceleratorPatternTest()
192198
MidpointRounding.AwayFromZero
193199
);
194200
string result = classicalPatternTime >= gpuWrappedAcceleratorPatternTime ? "faster" : "slower";
195-
Console.WriteLine($"Execution using the GPUWrappedAccelerator pattern is about {ratio} times {result} than execution using Classical pattern.");
201+
Console.WriteLine($"Using the GPUWrappedAccelerator pattern to load kernel is about {ratio} times {result} than when using Classical pattern.");
196202

197203
Console.WriteLine();
198204
Console.WriteLine();

TutorialApp/AllocatorExample.cs

Lines changed: 40 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,41 +14,48 @@
1414
namespace TutorialApp
1515
{
1616
/// <summary>
17-
/// Demonstrates the use of GPU and CPU parallel processing for neighbor summation in 2D arrays.
17+
/// Demonstrates the use of GPU and CPU parallel processing for neighbor summation in six 2D arrays
18+
/// using different GPU allocation modes and measuring performance.
19+
/// GPU processing uses the default stream on a particular GPU accelerator.
1820
/// </summary>
19-
/// <remarks>The <see cref="AllocatorExample"/> class initializes large and small 2D arrays of random
20-
/// float values and provides functionality to compute the sum of neighboring elements for each element in the
21-
/// arrays. It leverages both CPU and GPU resources for parallel processing, depending on the current mode of the
22-
/// <see cref="GPUAllocator"/> singleton. The class also demonstrates switching between different GPU allocation
23-
/// modes and measuring performance.</remarks>
2421
public class AllocatorExample
2522
{
26-
private readonly float[,] _big2DArrayOfFloats;
27-
private readonly float[,] _small2DArrayOfFloats;
23+
private readonly float[,] _2DArrayOfFloats1;
24+
private readonly float[,] _2DArrayOfFloats2;
25+
private readonly float[,] _2DArrayOfFloats3;
26+
private readonly float[,] _2DArrayOfFloats4;
27+
private readonly float[,] _2DArrayOfFloats5;
28+
private readonly float[,] _2DArrayOfFloats6;
2829

2930
/// <summary>
3031
/// Initializes a new instance of the <see cref="AllocatorExample"/> class.
3132
/// </summary>
32-
/// <remarks>This constructor initializes two 2D arrays
33-
/// populated with random float values between 0.0 and 1.0.</remarks>
33+
/// <remarks>This constructor initializes six 2D arrays
34+
/// populated with random float values between 0.0 and 1.0.
35+
/// The constructor also reports the available
36+
/// GPUs on the host machine (for information only).</remarks>
3437
public AllocatorExample()
3538
{
3639
Console.Clear();
3740
Random rand = new();
38-
_big2DArrayOfFloats = new float[1080, 1920];
39-
for (int i = 0; i < _big2DArrayOfFloats.GetLength(0); i++)
41+
int height = 600;
42+
int width = 800;
43+
_2DArrayOfFloats1 = new float[height, width];
44+
_2DArrayOfFloats2 = new float[height, width];
45+
_2DArrayOfFloats3 = new float[height, width];
46+
_2DArrayOfFloats4 = new float[height, width];
47+
_2DArrayOfFloats5 = new float[height, width];
48+
_2DArrayOfFloats6 = new float[height, width];
49+
for (int i = 0; i < height; i++)
4050
{
41-
for (int j = 0; j < _big2DArrayOfFloats.GetLength(1); j++)
51+
for (int j = 0; j < width; j++)
4252
{
43-
_big2DArrayOfFloats[i, j] = (float)rand.NextDouble();
44-
}
45-
}
46-
_small2DArrayOfFloats = new float[600, 800];
47-
for (int i = 0; i < _small2DArrayOfFloats.GetLength(0); i++)
48-
{
49-
for (int j = 0; j < _small2DArrayOfFloats.GetLength(1); j++)
50-
{
51-
_small2DArrayOfFloats[i, j] = (float)rand.NextDouble();
53+
_2DArrayOfFloats1[i, j] = (float)rand.NextDouble();
54+
_2DArrayOfFloats2[i, j] = (float)rand.NextDouble();
55+
_2DArrayOfFloats3[i, j] = (float)rand.NextDouble();
56+
_2DArrayOfFloats4[i, j] = (float)rand.NextDouble();
57+
_2DArrayOfFloats5[i, j] = (float)rand.NextDouble();
58+
_2DArrayOfFloats6[i, j] = (float)rand.NextDouble();
5259
}
5360
}
5461
Console.WriteLine("Available GPUs");
@@ -66,7 +73,7 @@ public AllocatorExample()
6673
/// <remarks>This method processes the input array using either CPU or GPU resources, depending on
6774
/// availability. If a GPU is available, the computation is offloaded to the GPU for improved performance.
6875
/// Otherwise, the computation is performed on the CPU using parallel processing. The radius for neighbor
69-
/// summation is fixed at 5. Neighboring elements are considered only if they fall within the bounds of the
76+
/// summation is fixed at 5. Neighboring elements are considered only if they fall within the bounds of the
7077
/// input array. The method is thread-safe and can be used in multi-threaded environments.</remarks>
7178
/// <param name="threadName">The name of the thread or task performing the operation, used for logging purposes.</param>
7279
/// <param name="input">A 2D array of floating-point numbers representing the input data. Must not be null.</param>
@@ -198,20 +205,11 @@ static void GPUWorkChung(ArrayView2D<float, Stride2D.DenseY> input, ArrayView2D<
198205

199206
/// <summary>
200207
/// Executes a sequence of six parallel operations, each performing a neighbor sum calculation on specified 2D
201-
/// arrays of floating-point numbers. One operation is performed on a large array, while the other five
202-
/// on small array.
208+
/// arrays of floating-point numbers.
203209
/// </summary>
204210
/// <remarks>This method utilizes <see cref="System.Threading.Tasks.Parallel.Invoke"/> to execute
205211
/// multiple neighbor sum calculations concurrently. The results of these calculations are returned as a tuple
206212
/// of six 2D arrays.</remarks>
207-
/// <returns>A tuple containing six 2D arrays of floating-point numbers, where each array represents the result of a
208-
/// neighbor sum calculation performed in parallel. The arrays are returned in the following order: <list
209-
/// type="number"> <item><description>Result of the "T1 big" neighbor sum calculation.</description></item>
210-
/// <item><description>Result of the "T2 small" neighbor sum calculation.</description></item>
211-
/// <item><description>Result of the "T3 small" neighbor sum calculation.</description></item>
212-
/// <item><description>Result of the "T4 small" neighbor sum calculation.</description></item>
213-
/// <item><description>Result of the "T5 small" neighbor sum calculation.</description></item>
214-
/// <item><description>Result of the "T6 small" neighbor sum calculation.</description></item> </list></returns>
215213
private (float[,], float[,], float[,], float[,], float[,], float[,]) ExecuteParallelSequence()
216214
{
217215
Stopwatch sw = new();
@@ -222,12 +220,12 @@ static void GPUWorkChung(ArrayView2D<float, Stride2D.DenseY> input, ArrayView2D<
222220
sw.Start();
223221
// Execute the neighbor sum calculations in parallel
224222
Parallel.Invoke(
225-
() => { resultT1 = NeighborSum($"T1 {_big2DArrayOfFloats.GetLength(0)}x{_big2DArrayOfFloats.GetLength(1)}", _big2DArrayOfFloats); },
226-
() => { resultT2 = NeighborSum($"T2 {_small2DArrayOfFloats.GetLength(0)}x{_small2DArrayOfFloats.GetLength(1)}", _small2DArrayOfFloats); },
227-
() => { resultT3 = NeighborSum($"T3 {_small2DArrayOfFloats.GetLength(0)}x{_small2DArrayOfFloats.GetLength(1)}", _small2DArrayOfFloats); },
228-
() => { resultT4 = NeighborSum($"T4 {_small2DArrayOfFloats.GetLength(0)}x{_small2DArrayOfFloats.GetLength(1)}", _small2DArrayOfFloats); },
229-
() => { resultT5 = NeighborSum($"T5 {_small2DArrayOfFloats.GetLength(0)}x{_small2DArrayOfFloats.GetLength(1)}", _small2DArrayOfFloats); },
230-
() => { resultT6 = NeighborSum($"T6 {_small2DArrayOfFloats.GetLength(0)}x{_small2DArrayOfFloats.GetLength(1)}", _small2DArrayOfFloats); }
223+
() => { resultT1 = NeighborSum($"T1", _2DArrayOfFloats1); },
224+
() => { resultT2 = NeighborSum($"T2", _2DArrayOfFloats2); },
225+
() => { resultT3 = NeighborSum($"T3", _2DArrayOfFloats3); },
226+
() => { resultT4 = NeighborSum($"T4", _2DArrayOfFloats4); },
227+
() => { resultT5 = NeighborSum($"T5", _2DArrayOfFloats5); },
228+
() => { resultT6 = NeighborSum($"T6", _2DArrayOfFloats6); }
231229
);
232230
// Stop measuring time
233231
sw.Stop();
@@ -236,11 +234,11 @@ static void GPUWorkChung(ArrayView2D<float, Stride2D.DenseY> input, ArrayView2D<
236234
}
237235

238236
/// <summary>
239-
/// Executes a series of parallel operations using different GPU allocation modes.
237+
/// Executes a series of parallel calculations using different GPU allocation modes.
240238
/// </summary>
241239
/// <remarks>This method sequentially sets the GPU allocation mode to various configurations
242-
/// (NoAccelerator, MostPowerfulGPU, LeastPowerfulGPU, and Standard) and executes parallel operations for each
243-
/// mode.</remarks>
240+
/// (NoAccelerator, MostPowerfulGPU, LeastPowerfulGPU and Standard) and executes parallel
241+
/// calculations for each mode.</remarks>
244242
public void Run()
245243
{
246244
//Set the GPU allocation mode to NoAccelerator to force CPU processing

0 commit comments

Comments
 (0)