pc2
diff --git a/Project.toml
Lines changed: 3 additions & 2 deletions b/Project.toml
Lines changed: 3 additions & 2 deletions
diff --git a/src/GPUInspector.jl
Lines changed: 28 additions & 29 deletions b/src/GPUInspector.jl
Lines changed: 28 additions & 29 deletions
diff --git a/test/bandwidth_tests.jl
Lines changed: 62 additions & 0 deletions b/test/bandwidth_tests.jl
Lines changed: 62 additions & 0 deletions
diff --git a/test/gpuinfo_tests.jl
Lines changed: 6 additions & 0 deletions b/test/gpuinfo_tests.jl
Lines changed: 6 additions & 0 deletions
diff --git a/test/peakflops_tests.jl
Lines changed: 17 additions & 0 deletions b/test/peakflops_tests.jl
Lines changed: 17 additions & 0 deletions
@@ -28,14 +28,15 @@ Glob = "1.3"
 HDF5 = "0.16"
 Reexport = "1.2"
 Requires = "1.3"
-ThreadPinning = "0.3, 0.4"
+ThreadPinning = "0.3, 0.4, 0.5, 0.6"
 UnicodePlots = "2.8"
 julia = "1.6"
 
 [extras]
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
 
 [targets]
-test = ["Test", "InteractiveUtils", "CairoMakie"]
+test = ["Test", "InteractiveUtils", "CairoMakie", "TestItemRunner"]
@@ -22,15 +22,39 @@ using Glob: glob
 
 # export BFloat16 for convenience
 const BFloat16 = CUDA.BFloat16
-export BFloat16
 
 include("UnitPrefixedBytes.jl")
-export UnitPrefixedBytes, B, KB, MB, GB, TB, KiB, MiB, GiB, TiB
-export bytes, simplify, change_base, value
 include("cuda_wrappers.jl")
-export get_temperatures, get_power_usages, get_gpu_utilizations
 include("utility.jl")
 include("utility_unroll.jl")
+include("monitoring.jl")
+include("workers.jl")
+include("gpuinfo.jl")
+include("p2p_bandwidth.jl")
+include("host2device_bandwidth.jl")
+include("stresstest_tests.jl")
+include("stresstest.jl")
+include("stresstest_cpu.jl")
+include("peakflops_gpu.jl")
+include("peakflops_gpu_matmul.jl")
+include("peakflops_gpu_fmas.jl")
+include("peakflops_gpu_wmmas.jl")
+include("memory_bandwidth.jl")
+include("memory_bandwidth_saxpy.jl")
+include("hdf5.jl")
+
+function __init__() # module initialization hook
+    @require CairoMakie="13f3f980-e62b-5c42-98c6-ff1f3baf88f0" include("requires/cairomakie.jl")
+    # ^ Requires.jl defers this include until CairoMakie is loaded in the session.
+    if CUDA.functional() # only touch the math mode when CUDA is usable
+        toggle_tensorcoremath(true; verbose=false) # by default, use CUDA.FAST_MATH
+    end
+end
+
+export BFloat16
+export UnitPrefixedBytes, B, KB, MB, GB, TB, KiB, MiB, GiB, TiB
+export bytes, simplify, change_base, value
+export get_temperatures, get_power_usages, get_gpu_utilizations
 export clear_gpu_memory,
     clear_all_gpus_memory,
     cublasGemmEx_wrapper!,
@@ -39,56 +63,31 @@ export clear_gpu_memory,
     hastensorcores, MultiLogger, multi_log
 export get_cpusocket_temperatures, get_cpu_utilizations, get_cpu_utilization
 export logspace
-include("monitoring.jl")
 export MonitoringResults,
     monitoring_start,
     monitoring_stop,
     plot_monitoring_results,
     savefig_monitoring_results,
     livemonitor_temperature,
     livemonitor_powerusage
-include("workers.jl")
 export @worker, @worker_create, @worker_killall
-
-include("gpuinfo.jl")
 export gpuinfo, gpuinfo_p2p_access, gpus
-include("p2p_bandwidth.jl")
 export p2p_bandwidth,
     p2p_bandwidth_all, p2p_bandwidth_bidirectional, p2p_bandwidth_bidirectional_all
-include("host2device_bandwidth.jl")
 export host2device_bandwidth
-include("stresstest_tests.jl")
-include("stresstest.jl")
-include("stresstest_cpu.jl")
 export stresstest, stresstest_cpu
-include("peakflops_gpu.jl")
-include("peakflops_gpu_matmul.jl")
-include("peakflops_gpu_fmas.jl")
-include("peakflops_gpu_wmmas.jl")
 export peakflops_gpu,
     peakflops_gpu_fmas,
     peakflops_gpu_wmmas,
     peakflops_gpu_matmul,
     peakflops_gpu_matmul_graphs,
     peakflops_gpu_matmul_scaling
 export theoretical_peakflops_gpu, theoretical_peakflops_gpu_tensorcores
-include("memory_bandwidth.jl")
-include("memory_bandwidth_saxpy.jl")
 export memory_bandwidth,
     memory_bandwidth_saxpy,
     memory_bandwidth_scaling,
     memory_bandwidth_saxpy_scaling,
     theoretical_memory_bandwidth
-
-include("hdf5.jl")
 export save_monitoring_results, load_monitoring_results
 
-function __init__()
-    @require CairoMakie="13f3f980-e62b-5c42-98c6-ff1f3baf88f0" include("requires/cairomakie.jl")
-
-    if CUDA.functional()
-        toggle_tensorcoremath(true; verbose=false) # by default, use CUDA.FAST_MATH
-    end
-end
-
 end
@@ -0,0 +1,62 @@
+@testitem "p2p_bandwidth" begin
+    # Smoke tests for the peer-to-peer bandwidth benchmarks. They only pin return
+    # types and the `nothing`/value layout of the pairwise matrices, not the numbers.
+    using LinearAlgebra # `diag` is used for the matrix checks below
+
+    # Matrix type the `*_all` variants are expected to return.
+    PairwiseBandwidths = Matrix{Union{Nothing,Float64}}
+
+    # True when no off-diagonal entry of `M` is `nothing`.
+    function offdiag_filled(M)
+        entries = (M[i, j] for i in axes(M, 1), j in axes(M, 2) if i != j)
+        return !any(isnothing, entries)
+    end
+
+    @testset "unidirectional" begin
+        # default call: expect a non-negative Float64
+        @test p2p_bandwidth(; verbose=false) isa Float64
+        @test p2p_bandwidth(; verbose=false) ≥ 0
+        # same return type for every option exercised here
+        @test p2p_bandwidth(MB(100); verbose=false) isa Float64
+        @test p2p_bandwidth(; src=CuDevice(0), dst=CuDevice(1), verbose=false) isa Float64
+        @test p2p_bandwidth(; dtype=Float16, verbose=false) isa Float64
+        @test p2p_bandwidth(; nbench=10, verbose=false) isa Float64
+        @test p2p_bandwidth(; hist=true, verbose=true) isa Float64
+        # all pairs: `nothing` on the diagonal, a value everywhere else
+        @test p2p_bandwidth_all(; verbose=false) isa PairwiseBandwidths
+        pairwise = p2p_bandwidth_all(; verbose=false)
+        @test all(isnothing, diag(pairwise))
+        @test offdiag_filled(pairwise)
+    end
+    @testset "bidirectional" begin
+        # default call: expect a non-negative Float64
+        @test p2p_bandwidth_bidirectional(; verbose=false) isa Float64
+        @test p2p_bandwidth_bidirectional(; verbose=false) ≥ 0
+        # same return type for every option exercised here
+        @test p2p_bandwidth_bidirectional(MB(100); verbose=false) isa Float64
+        @test p2p_bandwidth_bidirectional(; dtype=Float16, verbose=false) isa Float64
+        @test p2p_bandwidth_bidirectional(; nbench=10, verbose=false) isa Float64
+        @test p2p_bandwidth_bidirectional(; hist=true, verbose=true) isa Float64
+        # all pairs: `nothing` on the diagonal, a value everywhere else
+        @test p2p_bandwidth_bidirectional_all(; verbose=false) isa PairwiseBandwidths
+        pairwise = p2p_bandwidth_bidirectional_all(; verbose=false)
+        @test all(isnothing, diag(pairwise))
+        @test offdiag_filled(pairwise)
+    end
+end
+
+@testitem "host2device_bandwidth" begin # every variant must return `nothing`
+    @test host2device_bandwidth() === nothing
+    @test host2device_bandwidth(MB(100)) === nothing # with a positional argument
+    @test host2device_bandwidth(; dtype=Float16) === nothing # with an explicit dtype
+end
+
+@testitem "memory_bandwidth" begin # both benchmarks are expected to yield a Float64
+    @test memory_bandwidth() isa Float64
+    @test memory_bandwidth(MiB(10)) isa Float64
+    @test memory_bandwidth(; dtype=Float32) isa Float64
+    # SAXPY-based variant: same return type for the same kinds of options
+    @test memory_bandwidth_saxpy() isa Float64
+    @test memory_bandwidth_saxpy(; size=2 * 2^20) isa Float64
+    @test memory_bandwidth_saxpy(; dtype=Float32) isa Float64
+end
@@ -0,0 +1,6 @@
+@testitem "gpuinfo / gpus" begin # each call below must return `nothing`
+    @test gpus() === nothing
+    @test gpuinfo() === nothing
+    @test gpuinfo(0) === nothing # integer argument
+    @test gpuinfo(device()) === nothing # argument taken from `device()`
+end
@@ -0,0 +1,17 @@
+@testitem "peakflops_gpu (CUDA cores)" begin # @testitem, like the other test files
+    @test peakflops_gpu(; verbose=false, tensorcores=false) isa Float64
+    @test peakflops_gpu(; dtype=Float32, verbose=false, tensorcores=false) isa Float64
+    @test peakflops_gpu(; dtype=Float64, verbose=false, tensorcores=false) isa Float64
+end
+
+@testitem "peakflops_gpu (Tensor cores)" begin # same checks with tensorcores=true
+    @test peakflops_gpu(; verbose=false, tensorcores=true) isa Float64
+    @test peakflops_gpu(; dtype=Float16, verbose=false, tensorcores=true) isa Float64
+end
+
+@testitem "peakflops_gpu_matmul / scaling" begin # scaling returns a tuple of vectors
+    @test peakflops_gpu_matmul(; verbose=false) isa Float64
+    @test peakflops_gpu_matmul(; size=1024, dtype=Float64, verbose=false) isa Float64
+    @test peakflops_gpu_matmul(; nmatmuls=2, nbench=2, verbose=false) isa Float64
+    @test peakflops_gpu_matmul_scaling(; verbose=false) isa Tuple{Vector{Int64},Vector{Float64}}
+end